"""howard.objects.variants"""

import csv
import gc
import gzip
import io
import multiprocessing
import os
import random
import re
import shlex
import sqlite3
import subprocess
from tempfile import NamedTemporaryFile, TemporaryDirectory
import tempfile
import duckdb
import json
import yaml
import argparse
import Bio.bgzf as bgzf
import pandas as pd
from pyfaidx import Fasta
import numpy as np
import vcf
import logging as log
import fastparquet as fp
from multiprocesspandas import applyparallel
import cyvcf2
import pyBigWig
import math

from howard.functions.commons import *
from howard.objects.database import *
from howard.functions.databases import *
from howard.functions.utils import *


class Variants:
    """
    Holds a set of variants loaded from a VCF-like input (vcf, tsv, csv, psv,
    parquet, duckdb/sqlite database), together with its header, connection and
    configuration, and exposes query/statistics helpers over the data.
    """

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = None,
        param: dict = None,
        load: bool = False,
    ) -> None:
        """
        The function `__init__` initializes the variables, sets the input, output, config, param,
        connexion and header.

        :param conn: the connection to the database
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration of the model
        :param param: a dictionary containing the parameters of the model
        :param load: if True, load the input data immediately
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config / Param
        # `None` sentinels avoid the shared-mutable-default pitfall while
        # keeping the historical "empty dict" default behavior.
        self.set_config(config if config is not None else {})
        self.set_param(param if param is not None else {})

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        The function `set_samples` sets the samples attribute of an object to a provided list or
        retrieves it from a parameter dictionary.

        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
        input and sets the `samples` attribute of the class to the provided list. If no samples are
        provided, it tries to get the samples from the class's parameters using the `get_param` method
        :type samples: list
        :return: The `samples` list is being returned.
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples

    def get_samples(self) -> list:
        """
        This function returns a list of samples.
        :return: The `get_samples` method is returning the `samples` attribute of the object.
        """

        return self.samples

    def get_samples_check(self) -> bool:
        """
        This function returns the value of the "check" key within the "samples" dictionary retrieved
        from the parameters.
        :return: The method `get_samples_check` is returning the value of the key "check" inside the
        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
        method. If the key "check" is not found, it will return `True`.
        """

        return self.get_param().get("samples", {}).get("check", True)

    def set_input(self, input: str = None) -> None:
        """
        The function `set_input` takes a file name as input, extracts the name and extension, and sets
        attributes in the class accordingly.

        :param input: The `set_input` method in the provided code snippet is used to set attributes
        related to the input file (name, extension and format).
        :type input: str
        :raises ValueError: if `input` is neither a string nor a file-like object with a `name`.
        """

        if input and not isinstance(input, str):
            # Accept file-like objects by using their `name` attribute
            try:
                self.input = input.name
            except AttributeError:
                log.error(f"Input file '{input}' in bad format")
                raise ValueError(f"Input file '{input}' in bad format")
        else:
            self.input = input

        # Input format
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        The set_config function takes a config object and assigns it as the configuration object for
        the class.

        :param config: The `config` parameter in the `set_config` function is a dictionary object that
        contains configuration settings for the class. When you call the `set_config` function with a
        dictionary object as the argument, it will set that dictionary as the configuration object for
        the class
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        This function sets a parameter object for the class based on the input dictionary.

        :param param: The `set_param` method you provided takes a dictionary object as input and sets
        it as the `param` attribute of the class instance
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        This function initializes the variables that will be used in the rest of the class
        """

        self.prefix = "howard"
        self.table_variants = "variants"
        self.dataframe = None

        # Mapping from comparison keywords (as used in params) to SQL operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF header type -> internal type code
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF header type -> SQL column type
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
        returns False.
        :return: The value of the indexing parameter.
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        The function `get_connexion_config` returns a dictionary containing the configuration for a
        connection, including the number of threads and memory limit.
        :return: a dictionary containing the configuration for the connexion.
        """

        # config
        config = self.get_config()

        # Connexion config
        connexion_config = {}
        threads = self.get_threads()

        # Threads
        if threads:
            connexion_config["threads"] = threads

        # Memory
        if self.get_memory():
            connexion_config["memory_limit"] = self.get_memory()

        # Temporary directory
        if config.get("tmp", None):
            connexion_config["temp_directory"] = config.get("tmp")

        # Access
        if config.get("access", None):
            access = config.get("access")
            if access in ["RO"]:
                access = "READ_ONLY"
            elif access in ["RW"]:
                access = "READ_WRITE"
            connexion_db = self.get_connexion_db()
            # NOTE(review): substring test — likely intended
            # `connexion_db == ":memory:"`; kept as-is to preserve behavior.
            if connexion_db in ":memory:":
                access = "READ_WRITE"
            connexion_config["access_mode"] = access

        return connexion_config

    def get_duckdb_settings(self) -> dict:
        """
        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
        string.
        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
        """

        # config
        config = self.get_config()

        # duckdb settings
        duckdb_settings_dict = {}
        if config.get("duckdb_settings", None):
            duckdb_settings = config.get("duckdb_settings")
            duckdb_settings = full_path(duckdb_settings)
            # duckdb setting is a file (JSON or YAML)
            if os.path.exists(duckdb_settings):
                with open(duckdb_settings) as json_file:
                    duckdb_settings_dict = yaml.safe_load(json_file)
            # duckdb settings is a JSON string
            else:
                duckdb_settings_dict = json.loads(duckdb_settings)

        return duckdb_settings_dict

    def set_connexion_db(self) -> str:
        """
        The function `set_connexion_db` returns the appropriate database connection string based on the
        input format and connection type.
        :return: the value of the variable `connexion_db`.
        """

        # Default connexion db
        default_connexion_db = ":memory:"

        # Find connexion db
        if self.get_input_format() in ["db", "duckdb"]:
            connexion_db = self.get_input()
        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
            connexion_db = default_connexion_db
        elif self.get_connexion_type() in ["tmpfile"]:
            tmp_name = tempfile.mkdtemp(
                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
            )
            connexion_db = f"{tmp_name}/tmp.db"
        elif self.get_connexion_type() != "":
            connexion_db = self.get_connexion_type()
        else:
            connexion_db = default_connexion_db

        # Set connexion db
        self.connexion_db = connexion_db

        return connexion_db

    def set_connexion(self, conn) -> None:
        """
        The function `set_connexion` creates a connection to a database, with options for different
        database formats and settings.

        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
        database. If a connection is not provided, a new connection to an in-memory database is
        created. The method then proceeds to set up the connection based on the specified format
        (e.g., duckdb or sqlite)
        """

        # Connexion db
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings applied via PRAGMA, one per setting
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        The `set_output` function in Python sets the output file based on the input or a specified key
        in the config file, extracting the output name, extension, and format.

        :param output: The `output` parameter in the `set_output` method is used to specify the name of
        the output file. If the config file has an 'output' key, the method sets the output to the
        value of that key. If no output is provided, it sets the output to `None`
        :type output: str
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            with open(header_file_tmp, "w") as f:
                                vcf.Writer(f, db_header_from_columns)

                            # Replace #CHROM line with rel columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except Exception:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
        DataFrame based on the connection format.

        :param query: The `query` parameter in the `get_query_to_df` function is a string that
        represents the SQL query you want to execute. This query will be used to fetch data from a
        database and convert it into a pandas DataFrame
        :type query: str
        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
        function will only fetch up to that number of rows from the database query result. If no limit
        is specified, the full result is fetched
        :type limit: int
        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query
        if limit:
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

    def get_overview(self) -> None:
        """
        The function prints the input, output, config, and dataframe of the current object
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # garbage collector
        del df
        gc.collect()

        return None

    def get_stats(self) -> dict:
        """
        The `get_stats` function calculates and returns various statistics of the current object,
        including information about the input file, variants, samples, header fields, quality, and
        SNVs/InDels.
        :return: a dictionary containing various statistics of the current object. The dictionary has
        the following structure: "Infos", "Variants", "Samples", "Header" and "Quality" sections.
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: count genotypes per sample column when GT/FORMAT exist
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                        )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

        stats["Samples"] = samples
        stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map special VCF "Number" codes to their symbolic form
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                SELECT
                    avg(CAST(QUAL AS INTEGER)) AS Average,
                    min(CAST(QUAL AS INTEGER)) AS Minimum,
                    max(CAST(QUAL AS INTEGER)) AS Maximum,
                    stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                    median(CAST(QUAL AS INTEGER)) AS Median,
                    variance(CAST(QUAL AS INTEGER)) AS Variance
                FROM {table_variants_from}
                WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        sql_query_snv = f"""

            SELECT Type, count FROM (

                SELECT
                    'Total' AS Type,
                    count(*) AS count
                FROM {table_variants_from}

                UNION

                SELECT
                    'MNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 AND len(ALT) > 1
                  AND len(REF) = len(ALT)

                UNION

                SELECT
                    'InDel' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) > 1 OR len(ALT) > 1
                  AND len(REF) != len(ALT)

                UNION

                SELECT
                    'SNV' AS Type,
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1

            )

            ORDER BY count DESC

            """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
            SELECT
                concat(REF, '>', ALT) AS 'Substitution',
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1
            GROUP BY REF, ALT
            ORDER BY count(*) DESC
            """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

    def stats_to_file(self, file: str = None) -> str:
        """
        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
        into a JSON object, and writes the JSON object to the specified file.

        :param file: The `file` parameter is a string that represents the file path where the JSON data
        will be written
        :type file: str
        :return: the name of the file that was written to.
        """

        # Get stats
        stats = self.get_stats()

        # Serializing json
        json_object = json.dumps(stats, indent=4)

        # Writing to file
        with open(file, "w") as outfile:
            outfile.write(json_object)

        return file

    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        The `print_stats` function generates a markdown file and prints the statistics contained in a
        JSON file in a formatted manner.

        :param output_file: The `output_file` parameter is a string that specifies the path and
        filename of the output file where the stats will be printed in Markdown format. If no
        `output_file` is provided, a temporary directory will be created and the stats will be saved
        in a file named "stats.md" within that directory
        :type output_file: str
        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
        file where the statistics will be saved. If no value is provided, a temporary directory will be
        created and a default file name "stats.json" will be used
        :type json_file: str
        :return: The function `print_stats` does not return any value. It has a return type annotation
        of `None`.
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the value as a table, first directly
                        # then as embedded JSON; fall back to a bullet line.
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except Exception:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except Exception:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f"  - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None

    def get_input(self) -> str:
        """
        It returns the value of the input variable.
        :return: The input is being returned.
        """
        return self.input

    def get_input_format(self, input_file: str = None) -> str:
        """
        This function returns the format of the input variable, either from the provided input file or
        from the object's own input.

        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
        represents the file path of the input file. If no `input_file` is provided when calling the
        method, it will default to `None`
        :type input_file: str
        :return: The format of the input variable is being returned.
        """

        if not input_file:
            input_file = self.get_input()
        input_format = get_file_format(input_file)
        return input_format

    def get_input_compressed(self, input_file: str = None) -> str:
        """
        The function `get_input_compressed` returns whether the input file is compressed.

        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
        that represents the file path of the input file. If no `input_file` is provided when calling
        the method, it will default to `None` and the method will then call `self.get_input()`
        :type input_file: str
        :return: The function `get_input_compressed` returns the compressed format of the input
        variable.
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        It returns the output file.
        :return: The output file path.
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        The function `get_output_format` returns the format of the output file.

        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
        that represents the file path of the output file. If no `output_file` is provided when calling
        the method, it will default to the output obtained from the `get_output` method of the class
        instance.
        :type output_file: str
        :return: The format of the output file is being returned.
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        It returns the config
        :return: The config variable is being returned.
        """
        return self.config

    def get_param(self) -> dict:
        """
        It returns the param
        :return: The param variable is being returned.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        It returns the connexion_db attribute of the object
        :return: The connexion_db is being returned.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        It returns the prefix of the object.
        :return: The prefix is being returned.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        This function returns the table_variants attribute of the object

        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
        defaults to select (optional)
        :return: The table_variants attribute of the object.
        """

        # Access
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update"
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from"
        elif clause in ["from"]:
            # For Read Only: query the parquet file directly
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        The function `get_tmp_dir` returns the temporary directory path based on configuration
        parameters or a default path.
        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
        configuration, parameters, and a default value of "/tmp".
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        It returns the connexion type from the config, defaulting to "memory".

        :return: The connexion type is being returned.
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        It returns the connection object

        :return: The connection object.
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        This function closes the connection to the database.
        :return: The connection is being closed.
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        This function returns the header of the VCF file as a list of strings or as a VCF object

        :param type: the type of header you want to get, defaults to vcf (optional)
        :return: The header of the vcf file.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_infos_list(self) -> list:
        """
        This function retrieves a list of information fields from the header.
        :return: A list of information fields from the header.
        """

        # Init
        infos_list = []

        for field in self.get_header().infos:
            infos_list.append(field)

        return infos_list

    def get_header_length(self, file: str = None) -> int:
        """
        The function `get_header_length` returns the length of the header list, excluding the #CHROM
        line.

        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
        header file. If this argument is provided, the function will read the header from the specified
        file and return the length of the header list minus 1 (to exclude the #CHROM line)
        :type file: str
        :return: the length of the header list, excluding the #CHROM line.
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        This function returns the #CHROM columns line of a VCF header

        :return: The last line of the header list (the #CHROM line), or "" if no header.
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        This function returns the #CHROM columns of a VCF header as a list

        :return: The header columns as a list, or [] if no header.
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        This function returns the header columns as a quoted, comma-separated SQL column list

        :return: The header columns joined for use in a SQL SELECT clause.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    # NOTE(review): the original excerpt is truncated inside the definition of
    # `get_header_sample_list(self, check=False, samples=None, samples_force=False)`;
    # its body continues beyond this chunk and is intentionally not
    # reconstructed here.
If a sample is not found 1178 in the 1179 :type samples: list 1180 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1181 a boolean parameter that determines whether to force the function to return the sample list 1182 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1183 function will return the sample list without performing, defaults to False 1184 :type samples_force: bool (optional) 1185 :return: The function `get_header_sample_list` returns a list of samples based on the input 1186 parameters and conditions specified in the function. 1187 """ 1188 1189 # Init 1190 samples_list = [] 1191 1192 if samples is None: 1193 samples_list = self.header_vcf.samples 1194 else: 1195 samples_checked = [] 1196 for sample in samples: 1197 if sample in self.header_vcf.samples: 1198 samples_checked.append(sample) 1199 else: 1200 log.warning(f"Sample '{sample}' not defined in header") 1201 samples_list = samples_checked 1202 1203 # Force sample list without checking if is_genotype_column 1204 if samples_force: 1205 log.warning(f"Samples {samples_list} not checked if genotypes") 1206 return samples_list 1207 1208 if check: 1209 samples_checked = [] 1210 for sample in samples_list: 1211 if self.is_genotype_column(column=sample): 1212 samples_checked.append(sample) 1213 else: 1214 log.warning( 1215 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1216 ) 1217 samples_list = samples_checked 1218 1219 # Return samples list 1220 return samples_list 1221 1222 def is_genotype_column(self, column: str = None) -> bool: 1223 """ 1224 This function checks if a given column is a genotype column in a database. 1225 1226 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1227 represents the column name in a database table. This method checks if the specified column is a 1228 genotype column in the database. 
If a column name is provided, it calls the `is_genotype_column` 1229 method of 1230 :type column: str 1231 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1232 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1233 column name and returns the result. If the `column` parameter is None, it returns False. 1234 """ 1235 1236 if column is not None: 1237 return Database(database=self.get_input()).is_genotype_column(column=column) 1238 else: 1239 return False 1240 1241 def get_verbose(self) -> bool: 1242 """ 1243 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1244 exist 1245 1246 :return: The value of the key "verbose" in the config dictionary. 1247 """ 1248 return self.get_config().get("verbose", False) 1249 1250 def get_connexion_format(self) -> str: 1251 """ 1252 It returns the connexion format of the object. 1253 :return: The connexion_format is being returned. 1254 """ 1255 connexion_format = self.connexion_format 1256 if connexion_format not in ["duckdb", "sqlite"]: 1257 log.error(f"Unknown connexion format {connexion_format}") 1258 raise ValueError(f"Unknown connexion format {connexion_format}") 1259 else: 1260 return connexion_format 1261 1262 def insert_file_to_table( 1263 self, 1264 file, 1265 columns: str, 1266 header_len: int = 0, 1267 sep: str = "\t", 1268 chunksize: int = 1000000, 1269 ) -> None: 1270 """ 1271 The function reads a file in chunks and inserts each chunk into a table based on the specified 1272 database format. 1273 1274 :param file: The `file` parameter is the file that you want to load into a table. It should be 1275 the path to the file on your system 1276 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1277 should contain the names of the columns in the table where the data will be inserted. 
The column 1278 names should be separated by commas within the string. For example, if you have columns named 1279 "id", "name 1280 :type columns: str 1281 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1282 the number of lines to skip at the beginning of the file before reading the actual data. This 1283 parameter allows you to skip any header information present in the file before processing the 1284 data, defaults to 0 1285 :type header_len: int (optional) 1286 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1287 separator character that is used in the file being read. In this case, the default separator is 1288 set to `\t`, which represents a tab character. You can change this parameter to a different 1289 separator character if, defaults to \t 1290 :type sep: str (optional) 1291 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1292 when processing the file in chunks. In the provided code snippet, the default value for 1293 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1294 to 1000000 1295 :type chunksize: int (optional) 1296 """ 1297 1298 # Config 1299 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1300 connexion_format = self.get_connexion_format() 1301 1302 log.debug("chunksize: " + str(chunksize)) 1303 1304 if chunksize: 1305 for chunk in pd.read_csv( 1306 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1307 ): 1308 if connexion_format in ["duckdb"]: 1309 sql_insert_into = ( 1310 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1311 ) 1312 self.conn.execute(sql_insert_into) 1313 elif connexion_format in ["sqlite"]: 1314 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1315 1316 def load_data( 1317 self, 1318 input_file: str = None, 1319 drop_variants_table: bool = False, 1320 sample_size: int = 20480, 1321 ) -> None: 1322 """ 1323 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1324 table before loading the data and specify a sample size. 1325 1326 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1327 table 1328 :type input_file: str 1329 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1330 determines whether the variants table should be dropped before loading the data. If set to 1331 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1332 not be dropped, defaults to False 1333 :type drop_variants_table: bool (optional) 1334 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1335 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to 1336 20480 1337 :type sample_size: int (optional) 1338 """ 1339 1340 log.info("Loading...") 1341 1342 # change input file 1343 if input_file: 1344 self.set_input(input_file) 1345 self.set_header() 1346 1347 # drop variants table 1348 if drop_variants_table: 1349 self.drop_variants_table() 1350 1351 # get table variants 1352 table_variants = self.get_table_variants() 1353 1354 # Access 1355 access = self.get_config().get("access", None) 1356 log.debug(f"access: {access}") 1357 1358 # Input format and compress 1359 input_format = self.get_input_format() 1360 input_compressed = self.get_input_compressed() 1361 log.debug(f"input_format: {input_format}") 1362 log.debug(f"input_compressed: {input_compressed}") 1363 1364 # input_compressed_format 1365 if input_compressed: 1366 input_compressed_format = "gzip" 1367 else: 1368 input_compressed_format = "none" 1369 log.debug(f"input_compressed_format: {input_compressed_format}") 1370 1371 # Connexion format 1372 connexion_format = self.get_connexion_format() 1373 1374 # Sample size 1375 if not sample_size: 1376 sample_size = -1 1377 log.debug(f"sample_size: {sample_size}") 1378 1379 # Load data 1380 log.debug(f"Load Data from {input_format}") 1381 1382 # DuckDB connexion 1383 if connexion_format in ["duckdb"]: 1384 1385 # Database already exists 1386 if self.input_format in ["db", "duckdb"]: 1387 1388 if connexion_format in ["duckdb"]: 1389 log.debug(f"Input file format '{self.input_format}' duckDB") 1390 else: 1391 log.error( 1392 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1393 ) 1394 raise ValueError( 1395 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1396 ) 1397 1398 # Load from existing database format 1399 else: 1400 1401 try: 1402 # Create Table or View 1403 database = Database(database=self.input) 1404 sql_from = 
database.get_sql_from(sample_size=sample_size) 1405 1406 if access in ["RO"]: 1407 sql_load = ( 1408 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1409 ) 1410 else: 1411 sql_load = ( 1412 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1413 ) 1414 self.conn.execute(sql_load) 1415 1416 except: 1417 # Format not available 1418 log.error(f"Input file format '{self.input_format}' not available") 1419 raise ValueError( 1420 f"Input file format '{self.input_format}' not available" 1421 ) 1422 1423 # SQLite connexion 1424 elif connexion_format in ["sqlite"] and input_format in [ 1425 "vcf", 1426 "tsv", 1427 "csv", 1428 "psv", 1429 ]: 1430 1431 # Main structure 1432 structure = { 1433 "#CHROM": "VARCHAR", 1434 "POS": "INTEGER", 1435 "ID": "VARCHAR", 1436 "REF": "VARCHAR", 1437 "ALT": "VARCHAR", 1438 "QUAL": "VARCHAR", 1439 "FILTER": "VARCHAR", 1440 "INFO": "VARCHAR", 1441 } 1442 1443 # Strcuture with samples 1444 structure_complete = structure 1445 if self.get_header_sample_list(): 1446 structure["FORMAT"] = "VARCHAR" 1447 for sample in self.get_header_sample_list(): 1448 structure_complete[sample] = "VARCHAR" 1449 1450 # Columns list for create and insert 1451 sql_create_table_columns = [] 1452 sql_create_table_columns_list = [] 1453 for column in structure_complete: 1454 column_type = structure_complete[column] 1455 sql_create_table_columns.append( 1456 f'"{column}" {column_type} default NULL' 1457 ) 1458 sql_create_table_columns_list.append(f'"{column}"') 1459 1460 # Create database 1461 log.debug(f"Create Table {table_variants}") 1462 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1463 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1464 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1465 self.conn.execute(sql_create_table) 1466 1467 # chunksize define length of file chunk load file 1468 chunksize = 100000 1469 1470 # delimiter 1471 delimiter 
= file_format_delimiters.get(input_format, "\t") 1472 1473 # Load the input file 1474 with open(self.input, "rt") as input_file: 1475 1476 # Use the appropriate file handler based on the input format 1477 if input_compressed: 1478 input_file = bgzf.open(self.input, "rt") 1479 if input_format in ["vcf"]: 1480 header_len = self.get_header_length() 1481 else: 1482 header_len = 0 1483 1484 # Insert the file contents into a table 1485 self.insert_file_to_table( 1486 input_file, 1487 columns=sql_create_table_columns_list_sql, 1488 header_len=header_len, 1489 sep=delimiter, 1490 chunksize=chunksize, 1491 ) 1492 1493 else: 1494 log.error( 1495 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1496 ) 1497 raise ValueError( 1498 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1499 ) 1500 1501 # Explode INFOS fields into table fields 1502 if self.get_explode_infos(): 1503 self.explode_infos( 1504 prefix=self.get_explode_infos_prefix(), 1505 fields=self.get_explode_infos_fields(), 1506 force=True, 1507 ) 1508 1509 # Create index after insertion 1510 self.create_indexes() 1511 1512 def get_explode_infos(self) -> bool: 1513 """ 1514 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1515 to False if it is not set. 1516 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1517 value. If the parameter is not present, it will return False. 1518 """ 1519 1520 return self.get_param().get("explode", {}).get("explode_infos", False) 1521 1522 def get_explode_infos_fields( 1523 self, 1524 explode_infos_fields: str = None, 1525 remove_fields_not_in_header: bool = False, 1526 ) -> list: 1527 """ 1528 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1529 the input parameter `explode_infos_fields`. 
1530 1531 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1532 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1533 comma-separated list of field names to explode 1534 :type explode_infos_fields: str 1535 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1536 flag that determines whether to remove fields that are not present in the header. If it is set 1537 to `True`, any field that is not in the header will be excluded from the list of exploded 1538 information fields. If it is set to `, defaults to False 1539 :type remove_fields_not_in_header: bool (optional) 1540 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1541 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1542 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1543 Otherwise, it returns a list of exploded information fields after removing any spaces and 1544 splitting the string by commas. 
1545 """ 1546 1547 # If no fields, get it in param 1548 if not explode_infos_fields: 1549 explode_infos_fields = ( 1550 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1551 ) 1552 1553 # If no fields, defined as all fields in header using keyword 1554 if not explode_infos_fields: 1555 explode_infos_fields = "*" 1556 1557 # If fields list not empty 1558 if explode_infos_fields: 1559 1560 # Input fields list 1561 if isinstance(explode_infos_fields, str): 1562 fields_input = explode_infos_fields.split(",") 1563 elif isinstance(explode_infos_fields, list): 1564 fields_input = explode_infos_fields 1565 else: 1566 fields_input = [] 1567 1568 # Fields list without * keyword 1569 fields_without_all = fields_input.copy() 1570 if "*".casefold() in (item.casefold() for item in fields_without_all): 1571 fields_without_all.remove("*") 1572 1573 # Fields in header 1574 fields_in_header = sorted(list(set(self.get_header().infos))) 1575 1576 # Construct list of fields 1577 fields_output = [] 1578 for field in fields_input: 1579 1580 # Strip field 1581 field = field.strip() 1582 1583 # format keyword * in regex 1584 if field.upper() in ["*"]: 1585 field = ".*" 1586 1587 # Find all fields with pattern 1588 r = re.compile(field) 1589 fields_search = sorted(list(filter(r.match, fields_in_header))) 1590 1591 # Remove fields input from search 1592 if field in fields_search: 1593 fields_search = [field] 1594 elif fields_search != [field]: 1595 fields_search = sorted( 1596 list(set(fields_search).difference(fields_input)) 1597 ) 1598 1599 # If field is not in header (avoid not well formatted header) 1600 if not fields_search and not remove_fields_not_in_header: 1601 fields_search = [field] 1602 1603 # Add found fields 1604 for new_field in fields_search: 1605 # Add field, if not already exists, and if it is in header (if asked) 1606 if ( 1607 new_field not in fields_output 1608 and ( 1609 not remove_fields_not_in_header 1610 or new_field in fields_in_header 1611 ) 
1612 and new_field not in [".*"] 1613 ): 1614 fields_output.append(new_field) 1615 1616 return fields_output 1617 1618 else: 1619 1620 return [] 1621 1622 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1623 """ 1624 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1625 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1626 not provided. 1627 1628 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1629 prefix to be used for exploding or expanding information 1630 :type explode_infos_prefix: str 1631 :return: the value of the variable `explode_infos_prefix`. 1632 """ 1633 1634 if not explode_infos_prefix: 1635 explode_infos_prefix = ( 1636 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1637 ) 1638 1639 return explode_infos_prefix 1640 1641 def add_column( 1642 self, 1643 table_name, 1644 column_name, 1645 column_type, 1646 default_value=None, 1647 drop: bool = False, 1648 ) -> dict: 1649 """ 1650 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1651 doesn't already exist. 1652 1653 :param table_name: The name of the table to which you want to add a column 1654 :param column_name: The parameter "column_name" is the name of the column that you want to add 1655 to the table 1656 :param column_type: The `column_type` parameter specifies the data type of the column that you 1657 want to add to the table. It should be a string that represents the desired data type, such as 1658 "INTEGER", "TEXT", "REAL", etc 1659 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1660 default value for the newly added column. 
If a default value is provided, it will be assigned to 1661 the column for any existing rows that do not have a value for that column 1662 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1663 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1664 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1665 to False 1666 :type drop: bool (optional) 1667 :return: a boolean value indicating whether the column was successfully added to the table. 1668 """ 1669 1670 # added 1671 added = False 1672 dropped = False 1673 1674 # Check if the column already exists in the table 1675 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1676 columns = self.get_query_to_df(query).columns.tolist() 1677 if column_name.upper() in [c.upper() for c in columns]: 1678 log.debug( 1679 f"The {column_name} column already exists in the {table_name} table" 1680 ) 1681 if drop: 1682 self.drop_column(table_name=table_name, column_name=column_name) 1683 dropped = True 1684 else: 1685 return None 1686 else: 1687 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1688 1689 # Add column in table 1690 add_column_query = ( 1691 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1692 ) 1693 if default_value is not None: 1694 add_column_query += f" DEFAULT {default_value}" 1695 self.execute_query(add_column_query) 1696 added = not dropped 1697 log.debug( 1698 f"The {column_name} column was successfully added to the {table_name} table" 1699 ) 1700 1701 if added: 1702 added_column = { 1703 "table_name": table_name, 1704 "column_name": column_name, 1705 "column_type": column_type, 1706 "default_value": default_value, 1707 } 1708 else: 1709 added_column = None 1710 1711 return added_column 1712 1713 def drop_column( 1714 self, column: dict = None, table_name: str = None, column_name: str = None 1715 ) -> bool: 1716 """ 1717 The 
`drop_column` function drops a specified column from a given table in a database and returns 1718 True if the column was successfully dropped, and False if the column does not exist in the 1719 table. 1720 1721 :param column: The `column` parameter is a dictionary that contains information about the column 1722 you want to drop. It has two keys: 1723 :type column: dict 1724 :param table_name: The `table_name` parameter is the name of the table from which you want to 1725 drop a column 1726 :type table_name: str 1727 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1728 from the table 1729 :type column_name: str 1730 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1731 and False if the column does not exist in the table. 1732 """ 1733 1734 # Find column infos 1735 if column: 1736 if isinstance(column, dict): 1737 table_name = column.get("table_name", None) 1738 column_name = column.get("column_name", None) 1739 elif isinstance(column, str): 1740 table_name = self.get_table_variants() 1741 column_name = column 1742 else: 1743 table_name = None 1744 column_name = None 1745 1746 if not table_name and not column_name: 1747 return False 1748 1749 # Removed 1750 removed = False 1751 1752 # Check if the column already exists in the table 1753 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1754 columns = self.get_query_to_df(query).columns.tolist() 1755 if column_name in columns: 1756 log.debug(f"The {column_name} column exists in the {table_name} table") 1757 else: 1758 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1759 return False 1760 1761 # Add column in table # ALTER TABLE integers DROP k 1762 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1763 self.execute_query(add_column_query) 1764 removed = True 1765 log.debug( 1766 f"The {column_name} column was successfully dropped to the {table_name} table" 1767 ) 1768 
1769 return removed 1770 1771 def explode_infos( 1772 self, 1773 prefix: str = None, 1774 create_index: bool = False, 1775 fields: list = None, 1776 force: bool = False, 1777 proccess_all_fields_together: bool = False, 1778 table: str = None, 1779 ) -> list: 1780 """ 1781 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1782 individual columns, returning a list of added columns. 1783 1784 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1785 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1786 `self.get_explode_infos_prefix()` as the prefix 1787 :type prefix: str 1788 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1789 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1790 `False`, indexes will not be created. The default value is `False`, defaults to False 1791 :type create_index: bool (optional) 1792 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1793 that you want to explode into individual columns. If this parameter is not provided, all INFO 1794 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1795 a list to the ` 1796 :type fields: list 1797 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1798 determines whether to drop and recreate a column if it already exists in the table. If `force` 1799 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1800 defaults to False 1801 :type force: bool (optional) 1802 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1803 flag that determines whether to process all the INFO fields together or individually. If set to 1804 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1805 be processed individually. The default value is, defaults to False 1806 :type proccess_all_fields_together: bool (optional) 1807 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1808 of the table where the exploded INFO fields will be added as individual columns. If you provide 1809 a value for the `table` parameter, the function will use that table name. If the `table` 1810 parameter is 1811 :type table: str 1812 :return: The `explode_infos` function returns a list of added columns. 1813 """ 1814 1815 # drop indexes 1816 self.drop_indexes() 1817 1818 # connexion format 1819 connexion_format = self.get_connexion_format() 1820 1821 # Access 1822 access = self.get_config().get("access", None) 1823 1824 # Added columns 1825 added_columns = [] 1826 1827 if access not in ["RO"]: 1828 1829 # prefix 1830 if prefix in [None, True] or not isinstance(prefix, str): 1831 if self.get_explode_infos_prefix() not in [None, True]: 1832 prefix = self.get_explode_infos_prefix() 1833 else: 1834 prefix = "INFO/" 1835 1836 # table variants 1837 if table is not None: 1838 table_variants = table 1839 else: 1840 table_variants = self.get_table_variants(clause="select") 1841 1842 # extra infos 1843 try: 1844 extra_infos = self.get_extra_infos() 1845 except: 1846 extra_infos = [] 1847 1848 # Header infos 1849 header_infos = self.get_header().infos 1850 1851 log.debug( 1852 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1853 ) 1854 1855 sql_info_alter_table_array = [] 1856 1857 # Info fields to check 1858 fields_list = list(header_infos) 1859 if fields: 1860 fields_list += fields 1861 fields_list = set(fields_list) 1862 1863 # If no fields 1864 if not fields: 1865 fields = [] 1866 1867 # Translate fields if patterns 1868 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1869 1870 for info in fields: 1871 1872 info_id_sql = prefix + info 1873 1874 if ( 1875 info 
in fields_list 1876 or prefix + info in fields_list 1877 or info in extra_infos 1878 ): 1879 1880 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1881 1882 if info in header_infos: 1883 info_type = header_infos[info].type 1884 info_num = header_infos[info].num 1885 else: 1886 info_type = "String" 1887 info_num = 0 1888 1889 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1890 if info_num != 1: 1891 type_sql = "VARCHAR" 1892 1893 # Add field 1894 added_column = self.add_column( 1895 table_name=table_variants, 1896 column_name=info_id_sql, 1897 column_type=type_sql, 1898 default_value="null", 1899 drop=force, 1900 ) 1901 1902 if added_column: 1903 added_columns.append(added_column) 1904 1905 if added_column or force: 1906 1907 # add field to index 1908 self.index_additionnal_fields.append(info_id_sql) 1909 1910 # Update field array 1911 if connexion_format in ["duckdb"]: 1912 update_info_field = f""" 1913 "{info_id_sql}" = 1914 CASE 1915 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1916 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1917 END 1918 """ 1919 elif connexion_format in ["sqlite"]: 1920 update_info_field = f""" 1921 "{info_id_sql}" = 1922 CASE 1923 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1924 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1925 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1926 END 1927 """ 1928 1929 sql_info_alter_table_array.append(update_info_field) 1930 1931 if sql_info_alter_table_array: 1932 1933 # By chromosomes 1934 try: 1935 chromosomes_list = list( 1936 self.get_query_to_df( 1937 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1938 )["#CHROM"] 1939 ) 1940 except: 1941 chromosomes_list = [None] 1942 1943 for chrom in chromosomes_list: 1944 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1945 1946 # Where clause 1947 where_clause = "" 1948 if chrom and len(chromosomes_list) > 1: 1949 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1950 1951 # Update table 1952 if proccess_all_fields_together: 1953 sql_info_alter_table_array_join = ", ".join( 1954 sql_info_alter_table_array 1955 ) 1956 if sql_info_alter_table_array_join: 1957 sql_info_alter_table = f""" 1958 UPDATE {table_variants} 1959 SET {sql_info_alter_table_array_join} 1960 {where_clause} 1961 """ 1962 log.debug( 1963 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1964 ) 1965 # log.debug(sql_info_alter_table) 1966 self.conn.execute(sql_info_alter_table) 1967 else: 1968 sql_info_alter_num = 0 1969 for sql_info_alter in sql_info_alter_table_array: 1970 sql_info_alter_num += 1 1971 sql_info_alter_table = f""" 1972 UPDATE {table_variants} 1973 SET {sql_info_alter} 1974 {where_clause} 1975 """ 1976 log.debug( 1977 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
1978 ) 1979 # log.debug(sql_info_alter_table) 1980 self.conn.execute(sql_info_alter_table) 1981 1982 # create indexes 1983 if create_index: 1984 self.create_indexes() 1985 1986 return added_columns 1987 1988 def create_indexes(self) -> None: 1989 """ 1990 Create indexes on the table after insertion 1991 """ 1992 1993 # Access 1994 access = self.get_config().get("access", None) 1995 1996 # get table variants 1997 table_variants = self.get_table_variants("FROM") 1998 1999 if self.get_indexing() and access not in ["RO"]: 2000 # Create index 2001 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2002 self.conn.execute(sql_create_table_index) 2003 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2004 self.conn.execute(sql_create_table_index) 2005 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2006 self.conn.execute(sql_create_table_index) 2007 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2008 self.conn.execute(sql_create_table_index) 2009 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2010 self.conn.execute(sql_create_table_index) 2011 for field in self.index_additionnal_fields: 2012 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2013 self.conn.execute(sql_create_table_index) 2014 2015 def drop_indexes(self) -> None: 2016 """ 2017 Create indexes on the table after insertion 2018 """ 2019 2020 # Access 2021 access = self.get_config().get("access", None) 2022 2023 # get table variants 2024 table_variants = self.get_table_variants("FROM") 2025 2026 # Get database format 2027 connexion_format = 
self.get_connexion_format() 2028 2029 if access not in ["RO"]: 2030 if connexion_format in ["duckdb"]: 2031 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2032 elif connexion_format in ["sqlite"]: 2033 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2034 2035 list_indexes = self.conn.execute(sql_list_indexes) 2036 index_names = [row[0] for row in list_indexes.fetchall()] 2037 for index in index_names: 2038 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2039 self.conn.execute(sql_drop_table_index) 2040 2041 def read_vcf_header(self, f) -> list: 2042 """ 2043 It reads the header of a VCF file and returns a list of the header lines 2044 2045 :param f: the file object 2046 :return: The header lines of the VCF file. 2047 """ 2048 2049 header_list = [] 2050 for line in f: 2051 header_list.append(line) 2052 if line.startswith("#CHROM"): 2053 break 2054 return header_list 2055 2056 def read_vcf_header_file(self, file: str = None) -> list: 2057 """ 2058 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2059 uncompressed files. 2060 2061 :param file: The `file` parameter is a string that represents the path to the VCF header file 2062 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2063 default to `None` 2064 :type file: str 2065 :return: The function `read_vcf_header_file` returns a list. 2066 """ 2067 2068 if self.get_input_compressed(input_file=file): 2069 with bgzf.open(file, "rt") as f: 2070 return self.read_vcf_header(f=f) 2071 else: 2072 with open(file, "rt") as f: 2073 return self.read_vcf_header(f=f) 2074 2075 def execute_query(self, query: str): 2076 """ 2077 It takes a query as an argument, executes it, and returns the results 2078 2079 :param query: The query to be executed 2080 :return: The result of the query is being returned. 
2081 """ 2082 if query: 2083 return self.conn.execute(query) # .fetchall() 2084 else: 2085 return None 2086 2087 def export_output( 2088 self, 2089 output_file: str | None = None, 2090 output_header: str | None = None, 2091 export_header: bool = True, 2092 query: str | None = None, 2093 parquet_partitions: list | None = None, 2094 chunk_size: int | None = None, 2095 threads: int | None = None, 2096 sort: bool = False, 2097 index: bool = False, 2098 order_by: str | None = None, 2099 fields_to_rename: dict | None = None 2100 ) -> bool: 2101 """ 2102 The `export_output` function exports data from a VCF file to various formats, including VCF, 2103 CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and 2104 partitioning. 2105 2106 :param output_file: The `output_file` parameter is a string that specifies the name of the 2107 output file where the exported data will be saved 2108 :type output_file: str | None 2109 :param output_header: The `output_header` parameter is a string that specifies the name of the 2110 file where the header of the VCF file will be exported. If this parameter is not provided, the 2111 header will be exported to a file with the same name as the `output_file` parameter, but with 2112 the extension " 2113 :type output_header: str | None 2114 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2115 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2116 True, the header will be exported to a file. If `export_header` is False, the header will not 2117 be, defaults to True 2118 :type export_header: bool (optional) 2119 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2120 that can be used to filter and select specific data from the VCF file before exporting it. If 2121 provided, only the data that matches the query will be exported. 
This allows you to customize 2122 the exported data based on 2123 :type query: str | None 2124 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2125 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2126 organize data in a hierarchical directory structure based on the values of one or more columns. 2127 This can improve query performance when working with large datasets 2128 :type parquet_partitions: list | None 2129 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2130 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2131 multiple files. It helps in optimizing the export process by breaking down the data into 2132 manageable chunks for processing and storage 2133 :type chunk_size: int | None 2134 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2135 threads to be used during the export process. It determines the level of parallelism and can 2136 improve the performance of the export operation. If this parameter is not provided, the function 2137 will use the default number of threads 2138 :type threads: int | None 2139 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2140 determines whether the output file should be sorted based on genomic coordinates of the 2141 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2142 `False`,, defaults to False 2143 :type sort: bool (optional) 2144 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2145 determines whether an index should be created on the output file. If `index` is set to `True`, 2146 an index will be created on the output file. 
If `index` is set to `False`, no, defaults to False 2147 :type index: bool (optional) 2148 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2149 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2150 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2151 output file should be 2152 :type order_by: str | None 2153 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2154 mapping of field names to be renamed during the export process. This parameter allows you to 2155 customize the output field names before exporting the data. Each key-value pair in the 2156 dictionary represents the original field name as the key and the new field name 2157 :type fields_to_rename: dict | None 2158 :return: The `export_output` function returns a boolean value. It checks if the output file 2159 exists and returns True if it does, or None if it doesn't. 
2160 """ 2161 2162 # Log 2163 log.info("Exporting...") 2164 2165 # Full path 2166 output_file = full_path(output_file) 2167 output_header = full_path(output_header) 2168 2169 # Config 2170 config = self.get_config() 2171 2172 # Param 2173 param = self.get_param() 2174 2175 # Tmp files to remove 2176 tmp_to_remove = [] 2177 2178 # If no output, get it 2179 if not output_file: 2180 output_file = self.get_output() 2181 2182 # If not threads 2183 if not threads: 2184 threads = self.get_threads() 2185 2186 # Rename fields 2187 if not fields_to_rename: 2188 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2189 self.rename_info_fields(fields_to_rename=fields_to_rename) 2190 2191 # Auto header name with extension 2192 if export_header or output_header: 2193 if not output_header: 2194 output_header = f"{output_file}.hdr" 2195 # Export header 2196 self.export_header(output_file=output_file) 2197 2198 # Switch off export header if VCF output 2199 output_file_type = get_file_format(output_file) 2200 if output_file_type in ["vcf"]: 2201 export_header = False 2202 tmp_to_remove.append(output_header) 2203 2204 # Chunk size 2205 if not chunk_size: 2206 chunk_size = config.get("chunk_size", None) 2207 2208 # Parquet partition 2209 if not parquet_partitions: 2210 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2211 if parquet_partitions and isinstance(parquet_partitions, str): 2212 parquet_partitions = parquet_partitions.split(",") 2213 2214 # Order by 2215 if not order_by: 2216 order_by = param.get("export", {}).get("order_by", "") 2217 2218 # Header in output 2219 header_in_output = param.get("export", {}).get("include_header", False) 2220 2221 # Database 2222 database_source = self.get_connexion() 2223 2224 # Connexion format 2225 connexion_format = self.get_connexion_format() 2226 2227 # Explode infos 2228 if self.get_explode_infos(): 2229 self.explode_infos( 2230 prefix=self.get_explode_infos_prefix(), 2231 
fields=self.get_explode_infos_fields(), 2232 force=False, 2233 ) 2234 2235 # if connexion_format in ["sqlite"] or query: 2236 if connexion_format in ["sqlite"]: 2237 2238 # Export in Parquet 2239 random_tmp = "".join( 2240 random.choice(string.ascii_lowercase) for i in range(10) 2241 ) 2242 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2243 tmp_to_remove.append(database_source) 2244 2245 # Table Variants 2246 table_variants = self.get_table_variants() 2247 2248 # Create export query 2249 sql_query_export_subquery = f""" 2250 SELECT * FROM {table_variants} 2251 """ 2252 2253 # Write source file 2254 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2255 2256 # Create database 2257 database = Database( 2258 database=database_source, 2259 table="variants", 2260 header_file=output_header, 2261 conn_config=self.get_connexion_config(), 2262 ) 2263 2264 # Existing colomns header 2265 existing_columns_header = database.get_header_columns_from_database(query=query) 2266 2267 # Sample list 2268 if output_file_type in ["vcf"]: 2269 get_samples = self.get_samples() 2270 get_samples_check = self.get_samples_check() 2271 samples_force = get_samples is not None 2272 sample_list = self.get_header_sample_list( 2273 check=get_samples_check, 2274 samples=get_samples, 2275 samples_force=samples_force, 2276 ) 2277 else: 2278 sample_list = None 2279 2280 # Export file 2281 database.export( 2282 output_database=output_file, 2283 output_header=output_header, 2284 existing_columns_header=existing_columns_header, 2285 parquet_partitions=parquet_partitions, 2286 chunk_size=chunk_size, 2287 threads=threads, 2288 sort=sort, 2289 index=index, 2290 header_in_output=header_in_output, 2291 order_by=order_by, 2292 query=query, 2293 export_header=export_header, 2294 sample_list=sample_list, 2295 ) 2296 2297 # Remove 2298 remove_if_exists(tmp_to_remove) 2299 2300 return (os.path.exists(output_file) or None) and ( 2301 os.path.exists(output_file) 
or None 2302 ) 2303 2304 def get_extra_infos(self, table: str = None) -> list: 2305 """ 2306 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2307 in the header. 2308 2309 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2310 name of the table from which you want to retrieve the extra columns that are not present in the 2311 header. If the `table` parameter is not provided when calling the function, it will default to 2312 using the variants 2313 :type table: str 2314 :return: A list of columns that are in the specified table but not in the header of the table. 2315 """ 2316 2317 header_columns = [] 2318 2319 if not table: 2320 table = self.get_table_variants(clause="from") 2321 header_columns = self.get_header_columns() 2322 2323 # Check all columns in the database 2324 query = f""" SELECT * FROM {table} LIMIT 1 """ 2325 log.debug(f"query {query}") 2326 table_columns = self.get_query_to_df(query).columns.tolist() 2327 extra_columns = [] 2328 2329 # Construct extra infos (not in header) 2330 for column in table_columns: 2331 if column not in header_columns: 2332 extra_columns.append(column) 2333 2334 return extra_columns 2335 2336 def get_extra_infos_sql(self, table: str = None) -> str: 2337 """ 2338 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2339 by double quotes 2340 2341 :param table: The name of the table to get the extra infos from. 
If None, the default table is 2342 used 2343 :type table: str 2344 :return: A string of the extra infos 2345 """ 2346 2347 return ", ".join( 2348 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2349 ) 2350 2351 def export_header( 2352 self, 2353 header_name: str = None, 2354 output_file: str = None, 2355 output_file_ext: str = ".hdr", 2356 clean_header: bool = True, 2357 remove_chrom_line: bool = False, 2358 ) -> str: 2359 """ 2360 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2361 specified options, and writes it to a new file. 2362 2363 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2364 this parameter is not specified, the header will be written to the output file 2365 :type header_name: str 2366 :param output_file: The `output_file` parameter in the `export_header` function is used to 2367 specify the name of the output file where the header will be written. If this parameter is not 2368 provided, the header will be written to a temporary file 2369 :type output_file: str 2370 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2371 string that represents the extension of the output header file. By default, it is set to ".hdr" 2372 if not specified by the user. This extension will be appended to the `output_file` name to 2373 create the final, defaults to .hdr 2374 :type output_file_ext: str (optional) 2375 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2376 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2377 `True`, the function will clean the header by modifying certain lines based on a specific 2378 pattern. 
If `clean_header`, defaults to True 2379 :type clean_header: bool (optional) 2380 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2381 boolean flag that determines whether the #CHROM line should be removed from the header before 2382 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2383 defaults to False 2384 :type remove_chrom_line: bool (optional) 2385 :return: The function `export_header` returns the name of the temporary header file that is 2386 created. 2387 """ 2388 2389 if not header_name and not output_file: 2390 output_file = self.get_output() 2391 2392 if self.get_header(): 2393 2394 # Get header object 2395 header_obj = self.get_header() 2396 2397 # Create database 2398 db_for_header = Database(database=self.get_input()) 2399 2400 # Get real columns in the file 2401 db_header_columns = db_for_header.get_columns() 2402 2403 with tempfile.TemporaryDirectory() as tmpdir: 2404 2405 # Write header file 2406 header_file_tmp = os.path.join(tmpdir, "header") 2407 f = open(header_file_tmp, "w") 2408 vcf.Writer(f, header_obj) 2409 f.close() 2410 2411 # Replace #CHROM line with rel columns 2412 header_list = db_for_header.read_header_file( 2413 header_file=header_file_tmp 2414 ) 2415 header_list[-1] = "\t".join(db_header_columns) 2416 2417 # Remove CHROM line 2418 if remove_chrom_line: 2419 header_list.pop() 2420 2421 # Clean header 2422 if clean_header: 2423 header_list_clean = [] 2424 for head in header_list: 2425 # Clean head for malformed header 2426 head_clean = head 2427 head_clean = re.subn( 2428 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2429 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2430 head_clean, 2431 2, 2432 )[0] 2433 # Write header 2434 header_list_clean.append(head_clean) 2435 header_list = header_list_clean 2436 2437 tmp_header_name = output_file + output_file_ext 2438 2439 f = open(tmp_header_name, "w") 2440 for line in header_list: 2441 f.write(line) 
2442 f.close() 2443 2444 return tmp_header_name 2445 2446 def export_variant_vcf( 2447 self, 2448 vcf_file, 2449 remove_info: bool = False, 2450 add_samples: bool = True, 2451 list_samples: list = [], 2452 where_clause: str = "", 2453 index: bool = False, 2454 threads: int | None = None, 2455 ) -> bool | None: 2456 """ 2457 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2458 remove INFO field, add samples, and control compression and indexing. 2459 2460 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2461 written to. It is the output file that will contain the filtered VCF data based on the specified 2462 parameters 2463 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2464 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2465 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2466 in, defaults to False 2467 :type remove_info: bool (optional) 2468 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2469 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2470 If set to False, the samples will be removed. The default value is True, defaults to True 2471 :type add_samples: bool (optional) 2472 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2473 in the output VCF file. By default, all samples will be included. If you provide a list of 2474 samples, only those samples will be included in the output file 2475 :type list_samples: list 2476 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2477 determines whether or not to create an index for the output VCF file. If `index` is set to 2478 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2479 :type index: bool (optional) 2480 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2481 number of threads to use for exporting the VCF file. It determines how many parallel threads 2482 will be used during the export process. More threads can potentially speed up the export process 2483 by utilizing multiple cores of the processor. If 2484 :type threads: int | None 2485 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2486 method with various parameters including the output file, query, threads, sort flag, and index 2487 flag. The `export_output` method is responsible for exporting the VCF data based on the 2488 specified parameters and configurations provided in the `export_variant_vcf` function. 2489 """ 2490 2491 # Config 2492 config = self.get_config() 2493 2494 # Extract VCF 2495 log.debug("Export VCF...") 2496 2497 # Table variants 2498 table_variants = self.get_table_variants() 2499 2500 # Threads 2501 if not threads: 2502 threads = self.get_threads() 2503 2504 # Info fields 2505 if remove_info: 2506 if not isinstance(remove_info, str): 2507 remove_info = "." 
2508 info_field = f"""'{remove_info}' as INFO""" 2509 else: 2510 info_field = "INFO" 2511 2512 # Samples fields 2513 if add_samples: 2514 if not list_samples: 2515 list_samples = self.get_header_sample_list() 2516 if list_samples: 2517 samples_fields = " , FORMAT , " + " , ".join( 2518 [f""" "{sample}" """ for sample in list_samples] 2519 ) 2520 else: 2521 samples_fields = "" 2522 log.debug(f"samples_fields: {samples_fields}") 2523 else: 2524 samples_fields = "" 2525 2526 # Where clause 2527 if where_clause is None: 2528 where_clause = "" 2529 2530 # Variants 2531 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2532 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2533 log.debug(f"sql_query_select={sql_query_select}") 2534 2535 return self.export_output( 2536 output_file=vcf_file, 2537 output_header=None, 2538 export_header=True, 2539 query=sql_query_select, 2540 parquet_partitions=None, 2541 chunk_size=config.get("chunk_size", None), 2542 threads=threads, 2543 sort=True, 2544 index=index, 2545 order_by=None, 2546 ) 2547 2548 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2549 """ 2550 It takes a list of commands and runs them in parallel using the number of threads specified 2551 2552 :param commands: A list of commands to run 2553 :param threads: The number of threads to use, defaults to 1 (optional) 2554 """ 2555 2556 run_parallel_commands(commands, threads) 2557 2558 def get_threads(self, default: int = 1) -> int: 2559 """ 2560 This function returns the number of threads to use for a job, with a default value of 1 if not 2561 specified. 2562 2563 :param default: The `default` parameter in the `get_threads` method is used to specify the 2564 default number of threads to use if no specific value is provided. 
    def get_memory(self, default: str = None) -> str:
        """
        Return the memory setting from parameters or configuration.

        Resolution order: param "memory", then config "memory", then the
        provided default.

        :param default: fallback value when no "memory" entry is found
        :type default: str
        :return: the memory setting, or the default
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory (param takes precedence over config)
        input_memory = param.get("memory", config.get("memory", None))

        # Fall back to the provided default
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Update the variants table from a VCF file, dispatching on the
        connexion format (DuckDB or SQLite).

        Note: any other connexion format is silently ignored.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column
        of a VCF file (DuckDB connexion).

        The VCF is loaded into a pandas DataFrame; the local name
        `vcf_df` is referenced directly in the SQL below, which DuckDB
        resolves via its replacement scan of in-scope DataFrames — do not
        rename or remove it even though it looks unused.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame, skipping its header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the VCF INFO to the existing INFO, matching rows on
        # #CHROM/POS/REF/ALT; a ';' separates the two parts only when
        # both are non-empty ('' or '.' counts as empty)
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                                WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                AND table_parquet.\"POS\" = table_variants.\"POS\"
                                AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                AND table_parquet.\"REF\" = table_variants.\"REF\"
                                AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
            ;
            """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column
        of a VCF file (SQLite connexion).

        The VCF is loaded into a temporary table, the variants table is
        updated from it, then the temporary table is dropped.

        :param vcf_file: the path to the VCF file
        """

        # Create a temporary table with the same schema as 'variants'
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (SQLite has no concat())
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop the temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
2746 sql_drop = f"DROP TABLE {table_vcf}" 2747 self.conn.execute(sql_drop) 2748 2749 def drop_variants_table(self) -> None: 2750 """ 2751 > This function drops the variants table 2752 """ 2753 2754 table_variants = self.get_table_variants() 2755 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2756 self.conn.execute(sql_table_variants) 2757 2758 def set_variant_id( 2759 self, variant_id_column: str = "variant_id", force: bool = None 2760 ) -> str: 2761 """ 2762 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2763 `#CHROM`, `POS`, `REF`, and `ALT` columns 2764 2765 :param variant_id_column: The name of the column to be created in the variants table, defaults 2766 to variant_id 2767 :type variant_id_column: str (optional) 2768 :param force: If True, the variant_id column will be created even if it already exists 2769 :type force: bool 2770 :return: The name of the column that contains the variant_id 2771 """ 2772 2773 # Assembly 2774 assembly = self.get_param().get( 2775 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2776 ) 2777 2778 # INFO/Tag prefix 2779 prefix = self.get_explode_infos_prefix() 2780 2781 # Explode INFO/SVTYPE 2782 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2783 2784 # variants table 2785 table_variants = self.get_table_variants() 2786 2787 # variant_id column 2788 if not variant_id_column: 2789 variant_id_column = "variant_id" 2790 2791 # Creta variant_id column 2792 if "variant_id" not in self.get_extra_infos() or force: 2793 2794 # Create column 2795 self.add_column( 2796 table_name=table_variants, 2797 column_name=variant_id_column, 2798 column_type="UBIGINT", 2799 default_value="0", 2800 ) 2801 2802 # Update column 2803 self.conn.execute( 2804 f""" 2805 UPDATE {table_variants} 2806 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2807 """ 2808 ) 2809 2810 # Remove added columns 2811 for 
added_column in added_columns: 2812 self.drop_column(column=added_column) 2813 2814 # return variant_id column name 2815 return variant_id_column 2816 2817 def get_variant_id_column( 2818 self, variant_id_column: str = "variant_id", force: bool = None 2819 ) -> str: 2820 """ 2821 This function returns the variant_id column name 2822 2823 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2824 defaults to variant_id 2825 :type variant_id_column: str (optional) 2826 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2827 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2828 if it is not already set, or if it is set 2829 :type force: bool 2830 :return: The variant_id column name. 2831 """ 2832 2833 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2834 2835 ### 2836 # Annotation 2837 ### 2838 2839 def scan_databases( 2840 self, 2841 database_formats: list = ["parquet"], 2842 database_releases: list = ["current"], 2843 ) -> dict: 2844 """ 2845 The function `scan_databases` scans for available databases based on specified formats and 2846 releases. 2847 2848 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2849 of the databases to be scanned. In this case, the accepted format is "parquet" 2850 :type database_formats: list ["parquet"] 2851 :param database_releases: The `database_releases` parameter is a list that specifies the 2852 releases of the databases to be scanned. In the provided function, the default value for 2853 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2854 databases that are in the "current" 2855 :type database_releases: list 2856 :return: The function `scan_databases` returns a dictionary containing information about 2857 databases that match the specified formats and releases. 
2858 """ 2859 2860 # Config 2861 config = self.get_config() 2862 2863 # Param 2864 param = self.get_param() 2865 2866 # Param - Assembly 2867 assembly = param.get("assembly", config.get("assembly", None)) 2868 if not assembly: 2869 assembly = DEFAULT_ASSEMBLY 2870 log.warning(f"Default assembly '{assembly}'") 2871 2872 # Scan for availabled databases 2873 log.info( 2874 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2875 ) 2876 databases_infos_dict = databases_infos( 2877 database_folder_releases=database_releases, 2878 database_formats=database_formats, 2879 assembly=assembly, 2880 config=config, 2881 ) 2882 log.info( 2883 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2884 ) 2885 2886 return databases_infos_dict 2887 2888 def annotation(self) -> None: 2889 """ 2890 It annotates the VCF file with the annotations specified in the config file. 
2891 """ 2892 2893 # Config 2894 config = self.get_config() 2895 2896 # Param 2897 param = self.get_param() 2898 2899 # Param - Assembly 2900 assembly = param.get("assembly", config.get("assembly", None)) 2901 if not assembly: 2902 assembly = DEFAULT_ASSEMBLY 2903 log.warning(f"Default assembly '{assembly}'") 2904 2905 # annotations databases folders 2906 annotations_databases = set( 2907 config.get("folders", {}) 2908 .get("databases", {}) 2909 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2910 + config.get("folders", {}) 2911 .get("databases", {}) 2912 .get("parquet", ["~/howard/databases/parquet/current"]) 2913 + config.get("folders", {}) 2914 .get("databases", {}) 2915 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2916 ) 2917 2918 # Get param annotations 2919 if param.get("annotations", None) and isinstance( 2920 param.get("annotations", None), str 2921 ): 2922 log.debug(param.get("annotations", None)) 2923 param_annotation_list = param.get("annotations").split(",") 2924 else: 2925 param_annotation_list = [] 2926 2927 # Each tools param 2928 if param.get("annotation_parquet", None) != None: 2929 log.debug( 2930 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2931 ) 2932 if isinstance(param.get("annotation_parquet", None), list): 2933 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2934 else: 2935 param_annotation_list.append(param.get("annotation_parquet")) 2936 if param.get("annotation_snpsift", None) != None: 2937 if isinstance(param.get("annotation_snpsift", None), list): 2938 param_annotation_list.append( 2939 "snpsift:" 2940 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2941 ) 2942 else: 2943 param_annotation_list.append( 2944 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2945 ) 2946 if param.get("annotation_snpeff", None) != None: 2947 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2948 if param.get("annotation_bcftools", 
None) != None: 2949 if isinstance(param.get("annotation_bcftools", None), list): 2950 param_annotation_list.append( 2951 "bcftools:" 2952 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2953 ) 2954 else: 2955 param_annotation_list.append( 2956 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2957 ) 2958 if param.get("annotation_annovar", None) != None: 2959 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2960 if param.get("annotation_exomiser", None) != None: 2961 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2962 if param.get("annotation_splice", None) != None: 2963 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2964 2965 # Merge param annotations list 2966 param["annotations"] = ",".join(param_annotation_list) 2967 2968 # debug 2969 log.debug(f"param_annotations={param['annotations']}") 2970 2971 if param.get("annotations"): 2972 2973 # Log 2974 # log.info("Annotations - Check annotation parameters") 2975 2976 if not "annotation" in param: 2977 param["annotation"] = {} 2978 2979 # List of annotations parameters 2980 annotations_list_input = {} 2981 if isinstance(param.get("annotations", None), str): 2982 annotation_file_list = [ 2983 value for value in param.get("annotations", "").split(",") 2984 ] 2985 for annotation_file in annotation_file_list: 2986 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2987 else: 2988 annotations_list_input = param.get("annotations", {}) 2989 2990 log.info(f"Quick Annotations:") 2991 for annotation_key in list(annotations_list_input.keys()): 2992 log.info(f" {annotation_key}") 2993 2994 # List of annotations and associated fields 2995 annotations_list = {} 2996 2997 for annotation_file in annotations_list_input: 2998 2999 # Explode annotations if ALL 3000 if ( 3001 annotation_file.upper() == "ALL" 3002 or annotation_file.upper().startswith("ALL:") 3003 ): 3004 3005 # check ALL parameters (formats, 
releases) 3006 annotation_file_split = annotation_file.split(":") 3007 database_formats = "parquet" 3008 database_releases = "current" 3009 for annotation_file_option in annotation_file_split[1:]: 3010 database_all_options_split = annotation_file_option.split("=") 3011 if database_all_options_split[0] == "format": 3012 database_formats = database_all_options_split[1].split("+") 3013 if database_all_options_split[0] == "release": 3014 database_releases = database_all_options_split[1].split("+") 3015 3016 # Scan for availabled databases 3017 databases_infos_dict = self.scan_databases( 3018 database_formats=database_formats, 3019 database_releases=database_releases, 3020 ) 3021 3022 # Add found databases in annotation parameters 3023 for database_infos in databases_infos_dict.keys(): 3024 annotations_list[database_infos] = {"INFO": None} 3025 3026 else: 3027 annotations_list[annotation_file] = annotations_list_input[ 3028 annotation_file 3029 ] 3030 3031 # Check each databases 3032 if len(annotations_list): 3033 3034 log.info( 3035 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3036 ) 3037 3038 for annotation_file in annotations_list: 3039 3040 # Init 3041 annotations = annotations_list.get(annotation_file, None) 3042 3043 # Annotation snpEff 3044 if annotation_file.startswith("snpeff"): 3045 3046 log.debug(f"Quick Annotation snpEff") 3047 3048 if "snpeff" not in param["annotation"]: 3049 param["annotation"]["snpeff"] = {} 3050 3051 if "options" not in param["annotation"]["snpeff"]: 3052 param["annotation"]["snpeff"]["options"] = "" 3053 3054 # snpEff options in annotations 3055 param["annotation"]["snpeff"]["options"] = "".join( 3056 annotation_file.split(":")[1:] 3057 ) 3058 3059 # Annotation Annovar 3060 elif annotation_file.startswith("annovar"): 3061 3062 log.debug(f"Quick Annotation Annovar") 3063 3064 if "annovar" not in param["annotation"]: 3065 param["annotation"]["annovar"] = {} 3066 3067 if "annotations" not in param["annotation"]["annovar"]: 3068 param["annotation"]["annovar"]["annotations"] = {} 3069 3070 # Options 3071 annotation_file_split = annotation_file.split(":") 3072 for annotation_file_annotation in annotation_file_split[1:]: 3073 if annotation_file_annotation: 3074 param["annotation"]["annovar"]["annotations"][ 3075 annotation_file_annotation 3076 ] = annotations 3077 3078 # Annotation Exomiser 3079 elif annotation_file.startswith("exomiser"): 3080 3081 log.debug(f"Quick Annotation Exomiser") 3082 3083 param["annotation"]["exomiser"] = params_string_to_dict( 3084 annotation_file 3085 ) 3086 3087 # Annotation Splice 3088 elif annotation_file.startswith("splice"): 3089 3090 log.debug(f"Quick Annotation Splice") 3091 3092 param["annotation"]["splice"] = params_string_to_dict( 3093 annotation_file 3094 ) 3095 3096 # Annotation Parquet or BCFTOOLS 3097 else: 3098 3099 # Tools detection 3100 if annotation_file.startswith("bcftools:"): 3101 annotation_tool_initial = "bcftools" 3102 annotation_file = ":".join(annotation_file.split(":")[1:]) 3103 elif annotation_file.startswith("snpsift:"): 3104 annotation_tool_initial = 
"snpsift" 3105 annotation_file = ":".join(annotation_file.split(":")[1:]) 3106 elif annotation_file.startswith("bigwig:"): 3107 annotation_tool_initial = "bigwig" 3108 annotation_file = ":".join(annotation_file.split(":")[1:]) 3109 else: 3110 annotation_tool_initial = None 3111 3112 # list of files 3113 annotation_file_list = annotation_file.replace("+", ":").split( 3114 ":" 3115 ) 3116 3117 for annotation_file in annotation_file_list: 3118 3119 if annotation_file: 3120 3121 # Annotation tool initial 3122 annotation_tool = annotation_tool_initial 3123 3124 # Find file 3125 annotation_file_found = None 3126 3127 if os.path.exists(annotation_file): 3128 annotation_file_found = annotation_file 3129 elif os.path.exists(full_path(annotation_file)): 3130 annotation_file_found = full_path(annotation_file) 3131 else: 3132 # Find within assembly folders 3133 for annotations_database in annotations_databases: 3134 found_files = find_all( 3135 annotation_file, 3136 os.path.join( 3137 annotations_database, assembly 3138 ), 3139 ) 3140 if len(found_files) > 0: 3141 annotation_file_found = found_files[0] 3142 break 3143 if not annotation_file_found and not assembly: 3144 # Find within folders 3145 for ( 3146 annotations_database 3147 ) in annotations_databases: 3148 found_files = find_all( 3149 annotation_file, annotations_database 3150 ) 3151 if len(found_files) > 0: 3152 annotation_file_found = found_files[0] 3153 break 3154 log.debug( 3155 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3156 ) 3157 3158 # Full path 3159 annotation_file_found = full_path(annotation_file_found) 3160 3161 if annotation_file_found: 3162 3163 database = Database(database=annotation_file_found) 3164 quick_annotation_format = database.get_format() 3165 quick_annotation_is_compressed = ( 3166 database.is_compressed() 3167 ) 3168 quick_annotation_is_indexed = os.path.exists( 3169 f"{annotation_file_found}.tbi" 3170 ) 3171 bcftools_preference = False 3172 3173 # Check Annotation 
Tool 3174 if not annotation_tool: 3175 if ( 3176 bcftools_preference 3177 and quick_annotation_format 3178 in ["vcf", "bed"] 3179 and quick_annotation_is_compressed 3180 and quick_annotation_is_indexed 3181 ): 3182 annotation_tool = "bcftools" 3183 elif quick_annotation_format in [ 3184 "vcf", 3185 "bed", 3186 "tsv", 3187 "tsv", 3188 "csv", 3189 "json", 3190 "tbl", 3191 "parquet", 3192 "duckdb", 3193 ]: 3194 annotation_tool = "parquet" 3195 elif quick_annotation_format in ["bw"]: 3196 annotation_tool = "bigwig" 3197 else: 3198 log.error( 3199 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3200 ) 3201 raise ValueError( 3202 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3203 ) 3204 3205 log.debug( 3206 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3207 ) 3208 3209 # Annotation Tool dispatch 3210 if annotation_tool: 3211 if annotation_tool not in param["annotation"]: 3212 param["annotation"][annotation_tool] = {} 3213 if ( 3214 "annotations" 3215 not in param["annotation"][annotation_tool] 3216 ): 3217 param["annotation"][annotation_tool][ 3218 "annotations" 3219 ] = {} 3220 param["annotation"][annotation_tool][ 3221 "annotations" 3222 ][annotation_file_found] = annotations 3223 3224 else: 3225 log.warning( 3226 f"Quick Annotation File {annotation_file} does NOT exist" 3227 ) 3228 3229 self.set_param(param) 3230 3231 if param.get("annotation", None): 3232 log.info("Annotations") 3233 if param.get("annotation", {}).get("parquet", None): 3234 log.info("Annotations 'parquet'...") 3235 self.annotation_parquet() 3236 if param.get("annotation", {}).get("bcftools", None): 3237 log.info("Annotations 'bcftools'...") 3238 self.annotation_bcftools() 3239 if param.get("annotation", {}).get("snpsift", None): 3240 log.info("Annotations 'snpsift'...") 3241 self.annotation_snpsift() 3242 if param.get("annotation", {}).get("bigwig", None): 
3243 log.info("Annotations 'bigwig'...") 3244 self.annotation_bigwig() 3245 if param.get("annotation", {}).get("annovar", None): 3246 log.info("Annotations 'annovar'...") 3247 self.annotation_annovar() 3248 if param.get("annotation", {}).get("snpeff", None): 3249 log.info("Annotations 'snpeff'...") 3250 self.annotation_snpeff() 3251 if param.get("annotation", {}).get("exomiser", None) is not None: 3252 log.info("Annotations 'exomiser'...") 3253 self.annotation_exomiser() 3254 if param.get("annotation", {}).get("splice", None) is not None: 3255 log.info("Annotations 'splice' ...") 3256 self.annotation_splice() 3257 3258 # Explode INFOS fields into table fields 3259 if self.get_explode_infos(): 3260 self.explode_infos( 3261 prefix=self.get_explode_infos_prefix(), 3262 fields=self.get_explode_infos_fields(), 3263 force=True, 3264 ) 3265 3266 def annotation_bigwig(self, threads: int = None) -> None: 3267 """ 3268 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3269 3270 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3271 number of threads to be used for parallel processing during the annotation process. 
        If the
        `threads` parameter is not provided, the method will attempt to determine the optimal number of
        threads to use based on the system configuration
        :type threads: int
        :return: True
        """

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - bigwig databases folders ('annotations' + 'bigwig' entries)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - bigwig annotation databases to apply
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param > config > default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Early exit when the variants table is empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (logged for debugging only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # One config entry per bigwig database, consumed by the annotate loop below
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is http ?
                    if database.get_database().startswith("http"):

                        # Database is HTTP URL
                        db_file_is_http = True

                        # DB file kept as URL (pyBigWig can open remote files)
                        db_file = database.get_database()
                        log.warning(
                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                        )

                        # Retrieve automatic annotation field name from the file basename
                        annotation_field = clean_annotation_field(
                            os.path.basename(db_file).replace(".bw", "")
                        )
                        log.debug(
                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                        )

                        # Create automatic header file (no header can be fetched for a URL)
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, "w") as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(
                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                            )
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT HTTP URL
                        db_file_is_http = False

                    # Validate database file, header file and 'bw' format
                    if (
                        db_file is None
                        or db_hdr_file is None
                        or (not os.path.exists(db_file) and not db_file_is_http)
                        or not os.path.exists(db_hdr_file)
                        or not db_file_type in ["bw"]
                    ):
                        # if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # 'ALL'/'INFO' means: take every field declared in the database header
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name (value of the mapping)
                            annotation_field_new = annotation_fields[annotation_field]

                            # Column index in the header file, offset by the 3
                            # leading positional columns (#CHROM START END)
                            if (
                                annotation_field
                                in db_hdr_vcf.get_header_columns_as_list()
                            ):
                                annotation_field_index = (
                                    db_hdr_vcf.get_header_columns_as_list().index(
                                        annotation_field
                                    )
                                    - 3
                                )
                                cyvcf2_header_indexes[annotation_field_new] = (
                                    annotation_field_index
                                )
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = (
                                db_hdr_vcf_header_infos[annotation_field].id
                            )
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].num,
                                    "Type": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].type,
                                    "Description": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].desc,
                                }
                            )

                            # Add header on VCF
                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                                annotation_field_new,
                                db_hdr_vcf_header_infos[annotation_field].num,
                                db_hdr_vcf_header_infos[annotation_field].type,
                                db_hdr_vcf_header_infos[annotation_field].desc,
                                "HOWARD BigWig annotation",
                                "unknown",
                                self.code_type_map[
                                    db_hdr_vcf_header_infos[annotation_field].type
                                ],
                            )

                        # Load bigwig database and verify the format
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(
                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
                    )

                    # Export VCF file
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get(
                            "cyvcf2_header_list", []
                        ):
                            log.info(
                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                            )
                            input_vcf.add_info_to_header(cyvcf2_header_field)

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Stream variants, looking up the bigwig value at each position
                    log.info(f"Annotations 'bigwig' start...")
                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get(
                                "cyvcf2_header_indexes", None
                            )

                            # Retrieve value from chrom/pos (bigwig is 0-based half-open)
                            res = bw_db.values(
                                variant.CHROM, variant.POS - 1, variant.POS
                            )

                            # For each annotation field (and index)
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # Skip NaN (no bigwig value at this position)
                                if not np.isnan(
                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
                                ):
                                    variant.INFO[cyvcf2_header_index] = res[
                                        cyvcf2_header_indexes[cyvcf2_header_index]
                                    ]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        # NOTE(review): returns True although the signature says -> None (and the
        # empty-table early exit returns None) — callers should not rely on this value.
        return True

    def annotation_snpsift(self, threads: int = None) -> None:
        """
        Annotate the VCF using SnpSift databases: each database is applied with
        'SnpSift annotate' piped through 'bcftools annotate' (used for field renaming
        and compressed output).

        :param threads: Number of threads to use
        :return: None
3626 """ 3627 3628 # DEBUG 3629 log.debug("Start annotation with bcftools databases") 3630 3631 # Threads 3632 if not threads: 3633 threads = self.get_threads() 3634 log.debug("Threads: " + str(threads)) 3635 3636 # Config 3637 config = self.get_config() 3638 log.debug("Config: " + str(config)) 3639 3640 # Config - snpSift 3641 snpsift_bin_command = get_bin_command( 3642 bin="SnpSift.jar", 3643 tool="snpsift", 3644 bin_type="jar", 3645 config=config, 3646 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3647 ) 3648 if not snpsift_bin_command: 3649 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3650 log.error(msg_err) 3651 raise ValueError(msg_err) 3652 3653 # Config - bcftools 3654 bcftools_bin_command = get_bin_command( 3655 bin="bcftools", 3656 tool="bcftools", 3657 bin_type="bin", 3658 config=config, 3659 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3660 ) 3661 if not bcftools_bin_command: 3662 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3663 log.error(msg_err) 3664 raise ValueError(msg_err) 3665 3666 # Config - BCFTools databases folders 3667 databases_folders = set( 3668 self.get_config() 3669 .get("folders", {}) 3670 .get("databases", {}) 3671 .get("annotations", ["."]) 3672 + self.get_config() 3673 .get("folders", {}) 3674 .get("databases", {}) 3675 .get("bcftools", ["."]) 3676 ) 3677 log.debug("Databases annotations: " + str(databases_folders)) 3678 3679 # Param 3680 annotations = ( 3681 self.get_param() 3682 .get("annotation", {}) 3683 .get("snpsift", {}) 3684 .get("annotations", None) 3685 ) 3686 log.debug("Annotations: " + str(annotations)) 3687 3688 # Assembly 3689 assembly = self.get_param().get( 3690 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3691 ) 3692 3693 # Data 3694 table_variants = self.get_table_variants() 3695 3696 # Check if not empty 3697 log.debug("Check if not empty") 3698 sql_query_chromosomes = ( 3699 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3700 ) 3701 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3702 if not sql_query_chromosomes_df["count"][0]: 3703 log.info(f"VCF empty") 3704 return 3705 3706 # VCF header 3707 vcf_reader = self.get_header() 3708 log.debug("Initial header: " + str(vcf_reader.infos)) 3709 3710 # Existing annotations 3711 for vcf_annotation in self.get_header().infos: 3712 3713 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3714 log.debug( 3715 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3716 ) 3717 3718 if annotations: 3719 3720 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3721 3722 # Export VCF file 3723 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3724 3725 # Init 3726 commands = {} 3727 3728 for annotation in annotations: 3729 annotation_fields = annotations[annotation] 3730 3731 # Annotation Name 3732 annotation_name = os.path.basename(annotation) 3733 3734 if not annotation_fields: 3735 annotation_fields = {"INFO": None} 3736 3737 log.debug(f"Annotation '{annotation_name}'") 3738 log.debug( 3739 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3740 ) 3741 3742 # Create Database 3743 database = Database( 3744 database=annotation, 3745 databases_folders=databases_folders, 3746 assembly=assembly, 3747 ) 3748 3749 # Find files 3750 db_file = database.get_database() 3751 db_file = full_path(db_file) 3752 db_hdr_file = database.get_header_file() 3753 db_hdr_file = full_path(db_hdr_file) 3754 db_file_type = database.get_format() 3755 db_tbi_file = f"{db_file}.tbi" 3756 db_file_compressed = database.is_compressed() 3757 3758 # Check if compressed 3759 if not db_file_compressed: 3760 log.error( 3761 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3762 ) 3763 raise ValueError( 3764 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3765 ) 3766 3767 # Check if indexed 3768 if not os.path.exists(db_tbi_file): 3769 log.error( 3770 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3771 ) 3772 raise ValueError( 3773 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3774 ) 3775 3776 # Check index - try to create if not exists 3777 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3778 log.error("Annotation failed: database not valid") 3779 log.error(f"Annotation annotation file: {db_file}") 3780 log.error(f"Annotation annotation header: {db_hdr_file}") 3781 log.error(f"Annotation annotation index: {db_tbi_file}") 3782 raise ValueError( 3783 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3784 ) 3785 else: 3786 3787 log.debug( 3788 f"Annotation '{annotation}' - file: " 3789 + str(db_file) 3790 + " and " 3791 + str(db_hdr_file) 3792 ) 3793 3794 # Load header as VCF object 3795 db_hdr_vcf = Variants(input=db_hdr_file) 3796 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3797 log.debug( 3798 "Annotation database header: " 3799 + str(db_hdr_vcf_header_infos) 3800 ) 3801 3802 # For all fields in database 3803 annotation_fields_full = False 3804 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3805 annotation_fields = { 3806 key: key for key in db_hdr_vcf_header_infos 3807 } 3808 log.debug( 3809 "Annotation database header - All annotations added: " 3810 + str(annotation_fields) 3811 ) 3812 annotation_fields_full = True 3813 3814 # # Create file for field rename 3815 # log.debug("Create file for field rename") 3816 # tmp_rename = NamedTemporaryFile( 3817 # prefix=self.get_prefix(), 3818 # dir=self.get_tmp_dir(), 3819 # suffix=".rename", 3820 # delete=False, 3821 # ) 3822 # tmp_rename_name = tmp_rename.name 3823 # tmp_files.append(tmp_rename_name) 3824 3825 # Number of fields 3826 nb_annotation_field = 0 3827 annotation_list = [] 3828 annotation_infos_rename_list = [] 3829 3830 for annotation_field in 
annotation_fields: 3831 3832 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3833 annotation_fields_new_name = annotation_fields.get( 3834 annotation_field, annotation_field 3835 ) 3836 if not annotation_fields_new_name: 3837 annotation_fields_new_name = annotation_field 3838 3839 # Check if field is in DB and if field is not elready in input data 3840 if ( 3841 annotation_field in db_hdr_vcf.get_header().infos 3842 and annotation_fields_new_name 3843 not in self.get_header().infos 3844 ): 3845 3846 log.info( 3847 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3848 ) 3849 3850 # BCFTools annotate param to rename fields 3851 if annotation_field != annotation_fields_new_name: 3852 annotation_infos_rename_list.append( 3853 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3854 ) 3855 3856 # Add INFO field to header 3857 db_hdr_vcf_header_infos_number = ( 3858 db_hdr_vcf_header_infos[annotation_field].num or "." 3859 ) 3860 db_hdr_vcf_header_infos_type = ( 3861 db_hdr_vcf_header_infos[annotation_field].type 3862 or "String" 3863 ) 3864 db_hdr_vcf_header_infos_description = ( 3865 db_hdr_vcf_header_infos[annotation_field].desc 3866 or f"{annotation_field} description" 3867 ) 3868 db_hdr_vcf_header_infos_source = ( 3869 db_hdr_vcf_header_infos[annotation_field].source 3870 or "unknown" 3871 ) 3872 db_hdr_vcf_header_infos_version = ( 3873 db_hdr_vcf_header_infos[annotation_field].version 3874 or "unknown" 3875 ) 3876 3877 vcf_reader.infos[annotation_fields_new_name] = ( 3878 vcf.parser._Info( 3879 annotation_fields_new_name, 3880 db_hdr_vcf_header_infos_number, 3881 db_hdr_vcf_header_infos_type, 3882 db_hdr_vcf_header_infos_description, 3883 db_hdr_vcf_header_infos_source, 3884 db_hdr_vcf_header_infos_version, 3885 self.code_type_map[ 3886 db_hdr_vcf_header_infos_type 3887 ], 3888 ) 3889 ) 3890 3891 annotation_list.append(annotation_field) 3892 3893 nb_annotation_field += 1 3894 3895 else: 3896 
3897 if ( 3898 annotation_field 3899 not in db_hdr_vcf.get_header().infos 3900 ): 3901 log.warning( 3902 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3903 ) 3904 if ( 3905 annotation_fields_new_name 3906 in self.get_header().infos 3907 ): 3908 log.warning( 3909 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3910 ) 3911 3912 log.info( 3913 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3914 ) 3915 3916 annotation_infos = ",".join(annotation_list) 3917 3918 if annotation_infos != "": 3919 3920 # Annotated VCF (and error file) 3921 tmp_annotation_vcf_name = os.path.join( 3922 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3923 ) 3924 tmp_annotation_vcf_name_err = ( 3925 tmp_annotation_vcf_name + ".err" 3926 ) 3927 3928 # Add fields to annotate 3929 if not annotation_fields_full: 3930 annotation_infos_option = f"-info {annotation_infos}" 3931 else: 3932 annotation_infos_option = "" 3933 3934 # Info fields rename 3935 if annotation_infos_rename_list: 3936 annotation_infos_rename = " -c " + ",".join( 3937 annotation_infos_rename_list 3938 ) 3939 else: 3940 annotation_infos_rename = "" 3941 3942 # Annotate command 3943 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3944 3945 # Add command 3946 commands[command_annotate] = tmp_annotation_vcf_name 3947 3948 if commands: 3949 3950 # Export VCF file 3951 self.export_variant_vcf( 3952 vcf_file=tmp_vcf_name, 3953 remove_info=True, 3954 add_samples=False, 3955 index=True, 3956 ) 3957 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3958 3959 # Num command 3960 nb_command = 0 3961 3962 # Annotate 3963 for command_annotate in commands: 3964 nb_command += 1 3965 log.info( 3966 f"Annotation - 
Annotate [{nb_command}/{len(commands)}]..."
                    )
                    log.debug(f"command_annotate={command_annotate}")
                    run_parallel_commands([command_annotate], threads)

                    # Debug
                    # NOTE(review): unconditional debug copy to /tmp — looks like a leftover; confirm before release
                    shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                    # Update variants
                    log.info(
                        f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                    )
                    self.update_from_vcf(commands[command_annotate])

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with bcftools, using the VCF/BED annotation
        databases configured in param section "annotation" -> "bcftools" -> "annotations".

        Workflow:
        1. Resolve the bcftools binary and the annotation database folders from config.
        2. For each configured database, validate it (compressed + tabix-indexed +
           header file present) and register the requested INFO fields into the
           in-memory VCF header (`vcf_reader.infos`).
        3. Build one `bcftools annotate` shell command per chromosome, restricted to
           a BED of merged +/- 1Mb windows around the variant positions.
        4. Export the current variants to a temporary VCF, run all annotate commands
           in parallel, then merge the per-chromosome outputs back with
           `bcftools merge` and update the variants table from the merged VCF.

        :param threads: Number of threads to use; defaults to self.get_threads()
        :raises ValueError: if the bcftools binary is missing, a database file is not
            compressed/indexed/valid, or any annotate/merge command wrote "[E::" errors
        :return: None. Returns early (without annotating) if the variants table is empty.
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is computed here but not referenced below in this
        # method — temporary files are actually removed via the "rm -f" appended to
        # the merge command. Confirm whether this flag is still needed.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific ones
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Mapping of database path/name -> requested fields (or None for all)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # Only the temp *name* is reserved here; the file content is written later
        # by export_variant_vcf, once commands have been collected.
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []  # per-chromosome annotated VCFs, for the final merge
            commands = []  # bcftools annotate shell commands to run in parallel
            tmp_files = []  # temp files removed by the merge command ("rm -f ...")
            err_files = []  # stderr capture files, parsed after the merge

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "all INFO fields" (expanded below)
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bgzip compression required for tabix/bcftools)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (.tbi required for region-restricted annotation)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    # The database header file is itself parsed as a (headers-only) VCF
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database
                    # "ALL"/"INFO" sentinels expand to every INFO field of the database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not elready in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header
                            # Missing metadata falls back to permissive defaults
                            # (Number=".", Type="String", generic description)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            # bcftools "-c" rename syntax: NEW:=INFO/OLD
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command - keep only "##" meta lines for bcftools -h
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        # BED databases have no INFO column: map CHROM/POS/POS first
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            # +/- 1Mb window around each variant position, clamped at 0;
                            # merge_regions presumably coalesces overlapping intervals
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command
                            # --pair-logic exact: REF/ALT must match exactly;
                            # -Oz1: bgzipped output, compression level 1;
                            # tabix index is required for the later bcftools merge
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands
                # (split the thread budget across the parallel commands)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                # Injected by rewriting the command string built above
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command
                    # Cleanup chained onto the merge command so it only runs on success
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    # --force-samples: the per-chromosome files share sample names
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages
                    # Parse the captured stderr of all commands: htslib-style
                    # "[W::" lines are warnings, "[E::" lines are fatal errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotate with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotipic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict)
            Phenotypic features to construct "subject" section.
4498 Example: 4499 "phenotypicFeatures": 4500 [ 4501 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4502 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4503 ] 4504 - "hpo" (list) 4505 List of HPO ids as phenotypic features. 4506 Example: 4507 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4508 Default: [] 4509 - "outputOptions" (dict): 4510 Output options (see Exomiser docs). 4511 Default: 4512 "output_options" = 4513 { 4514 "outputContributingVariantsOnly": False, 4515 "numGenes": 0, 4516 "outputFormats": ["TSV_VARIANT", "VCF"] 4517 } 4518 - "transcript_source" (string): 4519 Transcript source (either "refseq", "ucsc", "ensembl") 4520 Default: "refseq" 4521 - "exomiser_to_info" (boolean): 4522 Add exomiser TSV file columns as INFO fields in VCF. 4523 Default: False 4524 - "release" (string): 4525 Exomise database release. 4526 If not exists, database release will be downloaded (take a while). 4527 Default: None (provided by application.properties configuration file) 4528 - "exomiser_application_properties" (file): 4529 Exomiser configuration file (see Exomiser docs). 4530 Useful to automatically download databases (especially for specific genome databases). 4531 4532 Notes: 4533 - If no sample in parameters, first sample in VCF will be chosen 4534 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4535 4536 :param threads: The number of threads to use 4537 :return: None. 
4538 """ 4539 4540 # DEBUG 4541 log.debug("Start annotation with Exomiser databases") 4542 4543 # Threads 4544 if not threads: 4545 threads = self.get_threads() 4546 log.debug("Threads: " + str(threads)) 4547 4548 # Config 4549 config = self.get_config() 4550 log.debug("Config: " + str(config)) 4551 4552 # Config - Folders - Databases 4553 databases_folders = ( 4554 config.get("folders", {}) 4555 .get("databases", {}) 4556 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4557 ) 4558 databases_folders = full_path(databases_folders) 4559 if not os.path.exists(databases_folders): 4560 log.error(f"Databases annotations: {databases_folders} NOT found") 4561 log.debug("Databases annotations: " + str(databases_folders)) 4562 4563 # Config - Exomiser 4564 exomiser_bin_command = get_bin_command( 4565 bin="exomiser-cli*.jar", 4566 tool="exomiser", 4567 bin_type="jar", 4568 config=config, 4569 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4570 ) 4571 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4572 if not exomiser_bin_command: 4573 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4574 log.error(msg_err) 4575 raise ValueError(msg_err) 4576 4577 # Param 4578 param = self.get_param() 4579 log.debug("Param: " + str(param)) 4580 4581 # Param - Exomiser 4582 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4583 log.debug(f"Param Exomiser: {param_exomiser}") 4584 4585 # Param - Assembly 4586 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4587 log.debug("Assembly: " + str(assembly)) 4588 4589 # Data 4590 table_variants = self.get_table_variants() 4591 4592 # Check if not empty 4593 log.debug("Check if not empty") 4594 sql_query_chromosomes = ( 4595 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4596 ) 4597 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4598 log.info(f"VCF empty") 4599 return False 4600 4601 # VCF header 4602 
vcf_reader = self.get_header() 4603 log.debug("Initial header: " + str(vcf_reader.infos)) 4604 4605 # Samples 4606 samples = self.get_header_sample_list() 4607 if not samples: 4608 log.error("No Samples in VCF") 4609 return False 4610 log.debug(f"Samples: {samples}") 4611 4612 # Memory limit 4613 memory_limit = self.get_memory("8G") 4614 log.debug(f"memory_limit: {memory_limit}") 4615 4616 # Exomiser java options 4617 exomiser_java_options = ( 4618 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4619 ) 4620 log.debug(f"Exomiser java options: {exomiser_java_options}") 4621 4622 # Download Exomiser (if not exists) 4623 exomiser_release = param_exomiser.get("release", None) 4624 exomiser_application_properties = param_exomiser.get( 4625 "exomiser_application_properties", None 4626 ) 4627 databases_download_exomiser( 4628 assemblies=[assembly], 4629 exomiser_folder=databases_folders, 4630 exomiser_release=exomiser_release, 4631 exomiser_phenotype_release=exomiser_release, 4632 exomiser_application_properties=exomiser_application_properties, 4633 ) 4634 4635 # Force annotation 4636 force_update_annotation = True 4637 4638 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4639 log.debug("Start annotation Exomiser") 4640 4641 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4642 4643 # tmp_dir = "/tmp/exomiser" 4644 4645 ### ANALYSIS ### 4646 ################ 4647 4648 # Create analysis.json through analysis dict 4649 # either analysis in param or by default 4650 # depending on preset exome/genome) 4651 4652 # Init analysis dict 4653 param_exomiser_analysis_dict = {} 4654 4655 # analysis from param 4656 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4657 param_exomiser_analysis = full_path(param_exomiser_analysis) 4658 4659 # If analysis in param -> load anlaysis json 4660 if param_exomiser_analysis: 4661 4662 # If param analysis is a file and exists 4663 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4664 param_exomiser_analysis 4665 ): 4666 # Load analysis file into analysis dict (either yaml or json) 4667 with open(param_exomiser_analysis) as json_file: 4668 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4669 4670 # If param analysis is a dict 4671 elif isinstance(param_exomiser_analysis, dict): 4672 # Load analysis dict into analysis dict (either yaml or json) 4673 param_exomiser_analysis_dict = param_exomiser_analysis 4674 4675 # Error analysis type 4676 else: 4677 log.error(f"Analysis type unknown. Check param file.") 4678 raise ValueError(f"Analysis type unknown. Check param file.") 4679 4680 # Case no input analysis config file/dict 4681 # Use preset (exome/genome) to open default config file 4682 if not param_exomiser_analysis_dict: 4683 4684 # default preset 4685 default_preset = "exome" 4686 4687 # Get param preset or default preset 4688 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4689 4690 # Try to find if preset is a file 4691 if os.path.exists(param_exomiser_preset): 4692 # Preset file is provided in full path 4693 param_exomiser_analysis_default_config_file = ( 4694 param_exomiser_preset 4695 ) 4696 # elif os.path.exists(full_path(param_exomiser_preset)): 4697 # # Preset file is provided in full path 4698 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4699 elif os.path.exists( 4700 os.path.join(folder_config, param_exomiser_preset) 4701 ): 4702 # Preset file is provided a basename in config folder (can be a path with subfolders) 4703 param_exomiser_analysis_default_config_file = os.path.join( 4704 folder_config, param_exomiser_preset 4705 ) 4706 else: 4707 # Construct preset file 4708 param_exomiser_analysis_default_config_file = os.path.join( 4709 folder_config, 4710 f"preset-{param_exomiser_preset}-analysis.json", 4711 ) 4712 4713 # If preset file exists 4714 param_exomiser_analysis_default_config_file = full_path( 4715 
param_exomiser_analysis_default_config_file 4716 ) 4717 if os.path.exists(param_exomiser_analysis_default_config_file): 4718 # Load prest file into analysis dict (either yaml or json) 4719 with open( 4720 param_exomiser_analysis_default_config_file 4721 ) as json_file: 4722 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4723 json_file 4724 ) 4725 4726 # Error preset file 4727 else: 4728 log.error( 4729 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4730 ) 4731 raise ValueError( 4732 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4733 ) 4734 4735 # If no analysis dict created 4736 if not param_exomiser_analysis_dict: 4737 log.error(f"No analysis config") 4738 raise ValueError(f"No analysis config") 4739 4740 # Log 4741 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4742 4743 ### PHENOPACKET ### 4744 ################### 4745 4746 # If no PhenoPacket in analysis dict -> check in param 4747 if "phenopacket" not in param_exomiser_analysis_dict: 4748 4749 # If PhenoPacket in param -> load anlaysis json 4750 if param_exomiser.get("phenopacket", None): 4751 4752 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4753 param_exomiser_phenopacket = full_path( 4754 param_exomiser_phenopacket 4755 ) 4756 4757 # If param phenopacket is a file and exists 4758 if isinstance( 4759 param_exomiser_phenopacket, str 4760 ) and os.path.exists(param_exomiser_phenopacket): 4761 # Load phenopacket file into analysis dict (either yaml or json) 4762 with open(param_exomiser_phenopacket) as json_file: 4763 param_exomiser_analysis_dict["phenopacket"] = ( 4764 yaml.safe_load(json_file) 4765 ) 4766 4767 # If param phenopacket is a dict 4768 elif isinstance(param_exomiser_phenopacket, dict): 4769 # Load phenopacket dict into analysis dict (either yaml or json) 4770 param_exomiser_analysis_dict["phenopacket"] = ( 4771 param_exomiser_phenopacket 4772 ) 4773 4774 # Error phenopacket type 
4775 else: 4776 log.error(f"Phenopacket type unknown. Check param file.") 4777 raise ValueError( 4778 f"Phenopacket type unknown. Check param file." 4779 ) 4780 4781 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4782 if "phenopacket" not in param_exomiser_analysis_dict: 4783 4784 # Init PhenoPacket 4785 param_exomiser_analysis_dict["phenopacket"] = { 4786 "id": "analysis", 4787 "proband": {}, 4788 } 4789 4790 ### Add subject ### 4791 4792 # If subject exists 4793 param_exomiser_subject = param_exomiser.get("subject", {}) 4794 4795 # If subject not exists -> found sample ID 4796 if not param_exomiser_subject: 4797 4798 # Found sample ID in param 4799 sample = param_exomiser.get("sample", None) 4800 4801 # Find sample ID (first sample) 4802 if not sample: 4803 sample_list = self.get_header_sample_list() 4804 if len(sample_list) > 0: 4805 sample = sample_list[0] 4806 else: 4807 log.error(f"No sample found") 4808 raise ValueError(f"No sample found") 4809 4810 # Create subject 4811 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4812 4813 # Add to dict 4814 param_exomiser_analysis_dict["phenopacket"][ 4815 "subject" 4816 ] = param_exomiser_subject 4817 4818 ### Add "phenotypicFeatures" ### 4819 4820 # If phenotypicFeatures exists 4821 param_exomiser_phenotypicfeatures = param_exomiser.get( 4822 "phenotypicFeatures", [] 4823 ) 4824 4825 # If phenotypicFeatures not exists -> Try to infer from hpo list 4826 if not param_exomiser_phenotypicfeatures: 4827 4828 # Found HPO in param 4829 param_exomiser_hpo = param_exomiser.get("hpo", []) 4830 4831 # Split HPO if list in string format separated by comma 4832 if isinstance(param_exomiser_hpo, str): 4833 param_exomiser_hpo = param_exomiser_hpo.split(",") 4834 4835 # Create HPO list 4836 for hpo in param_exomiser_hpo: 4837 hpo_clean = re.sub("[^0-9]", "", hpo) 4838 param_exomiser_phenotypicfeatures.append( 4839 { 4840 "type": { 4841 "id": f"HP:{hpo_clean}", 4842 "label": 
f"HP:{hpo_clean}", 4843 } 4844 } 4845 ) 4846 4847 # Add to dict 4848 param_exomiser_analysis_dict["phenopacket"][ 4849 "phenotypicFeatures" 4850 ] = param_exomiser_phenotypicfeatures 4851 4852 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4853 if not param_exomiser_phenotypicfeatures: 4854 for step in param_exomiser_analysis_dict.get( 4855 "analysis", {} 4856 ).get("steps", []): 4857 if "hiPhivePrioritiser" in step: 4858 param_exomiser_analysis_dict.get("analysis", {}).get( 4859 "steps", [] 4860 ).remove(step) 4861 4862 ### Add Input File ### 4863 4864 # Initial file name and htsFiles 4865 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4866 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4867 { 4868 "uri": tmp_vcf_name, 4869 "htsFormat": "VCF", 4870 "genomeAssembly": assembly, 4871 } 4872 ] 4873 4874 ### Add metaData ### 4875 4876 # If metaData not in analysis dict 4877 if "metaData" not in param_exomiser_analysis_dict: 4878 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4879 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4880 "createdBy": "howard", 4881 "phenopacketSchemaVersion": 1, 4882 } 4883 4884 ### OutputOptions ### 4885 4886 # Init output result folder 4887 output_results = os.path.join(tmp_dir, "results") 4888 4889 # If no outputOptions in analysis dict 4890 if "outputOptions" not in param_exomiser_analysis_dict: 4891 4892 # default output formats 4893 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4894 4895 # Get outputOptions in param 4896 output_options = param_exomiser.get("outputOptions", None) 4897 4898 # If no output_options in param -> check 4899 if not output_options: 4900 output_options = { 4901 "outputContributingVariantsOnly": False, 4902 "numGenes": 0, 4903 "outputFormats": defaut_output_formats, 4904 } 4905 4906 # Replace outputDirectory in output options 4907 output_options["outputDirectory"] = output_results 4908 output_options["outputFileName"] = "howard" 4909 4910 # 
Add outputOptions in analysis dict 4911 param_exomiser_analysis_dict["outputOptions"] = output_options 4912 4913 else: 4914 4915 # Replace output_results and output format (if exists in param) 4916 param_exomiser_analysis_dict["outputOptions"][ 4917 "outputDirectory" 4918 ] = output_results 4919 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4920 list( 4921 set( 4922 param_exomiser_analysis_dict.get( 4923 "outputOptions", {} 4924 ).get("outputFormats", []) 4925 + ["TSV_VARIANT", "VCF"] 4926 ) 4927 ) 4928 ) 4929 4930 # log 4931 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4932 4933 ### ANALYSIS FILE ### 4934 ##################### 4935 4936 ### Full JSON analysis config file ### 4937 4938 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4939 with open(exomiser_analysis, "w") as fp: 4940 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4941 4942 ### SPLIT analysis and sample config files 4943 4944 # Splitted analysis dict 4945 param_exomiser_analysis_dict_for_split = ( 4946 param_exomiser_analysis_dict.copy() 4947 ) 4948 4949 # Phenopacket JSON file 4950 exomiser_analysis_phenopacket = os.path.join( 4951 tmp_dir, "analysis_phenopacket.json" 4952 ) 4953 with open(exomiser_analysis_phenopacket, "w") as fp: 4954 json.dump( 4955 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4956 fp, 4957 indent=4, 4958 ) 4959 4960 # Analysis JSON file without Phenopacket parameters 4961 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4962 exomiser_analysis_analysis = os.path.join( 4963 tmp_dir, "analysis_analysis.json" 4964 ) 4965 with open(exomiser_analysis_analysis, "w") as fp: 4966 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4967 4968 ### INITAL VCF file ### 4969 ####################### 4970 4971 ### Create list of samples to use and include inti initial VCF file #### 4972 4973 # Subject (main sample) 4974 # Get sample ID in analysis dict 4975 sample_subject = ( 4976 
param_exomiser_analysis_dict.get("phenopacket", {}) 4977 .get("subject", {}) 4978 .get("id", None) 4979 ) 4980 sample_proband = ( 4981 param_exomiser_analysis_dict.get("phenopacket", {}) 4982 .get("proband", {}) 4983 .get("subject", {}) 4984 .get("id", None) 4985 ) 4986 sample = [] 4987 if sample_subject: 4988 sample.append(sample_subject) 4989 if sample_proband: 4990 sample.append(sample_proband) 4991 4992 # Get sample ID within Pedigree 4993 pedigree_persons_list = ( 4994 param_exomiser_analysis_dict.get("phenopacket", {}) 4995 .get("pedigree", {}) 4996 .get("persons", {}) 4997 ) 4998 4999 # Create list with all sample ID in pedigree (if exists) 5000 pedigree_persons = [] 5001 for person in pedigree_persons_list: 5002 pedigree_persons.append(person.get("individualId")) 5003 5004 # Concat subject sample ID and samples ID in pedigreesamples 5005 samples = list(set(sample + pedigree_persons)) 5006 5007 # Check if sample list is not empty 5008 if not samples: 5009 log.error(f"No samples found") 5010 raise ValueError(f"No samples found") 5011 5012 # Create VCF with sample (either sample in param or first one by default) 5013 # Export VCF file 5014 self.export_variant_vcf( 5015 vcf_file=tmp_vcf_name, 5016 remove_info=True, 5017 add_samples=True, 5018 list_samples=samples, 5019 index=False, 5020 ) 5021 5022 ### Execute Exomiser ### 5023 ######################## 5024 5025 # Init command 5026 exomiser_command = "" 5027 5028 # Command exomiser options 5029 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 5030 5031 # Release 5032 exomiser_release = param_exomiser.get("release", None) 5033 if exomiser_release: 5034 # phenotype data version 5035 exomiser_options += ( 5036 f" --exomiser.phenotype.data-version={exomiser_release} " 5037 ) 5038 # data version 5039 exomiser_options += ( 5040 f" --exomiser.{assembly}.data-version={exomiser_release} " 5041 ) 5042 # variant 
white list 5043 variant_white_list_file = ( 5044 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 5045 ) 5046 if os.path.exists( 5047 os.path.join( 5048 databases_folders, assembly, variant_white_list_file 5049 ) 5050 ): 5051 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 5052 5053 # transcript_source 5054 transcript_source = param_exomiser.get( 5055 "transcript_source", None 5056 ) # ucsc, refseq, ensembl 5057 if transcript_source: 5058 exomiser_options += ( 5059 f" --exomiser.{assembly}.transcript-source={transcript_source} " 5060 ) 5061 5062 # If analysis contain proband param 5063 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 5064 "proband", {} 5065 ): 5066 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 5067 5068 # If no proband (usually uniq sample) 5069 else: 5070 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5071 5072 # Log 5073 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5074 5075 # Run command 5076 result = subprocess.call( 5077 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5078 ) 5079 if result: 5080 log.error("Exomiser command failed") 5081 raise ValueError("Exomiser command failed") 5082 5083 ### RESULTS ### 5084 ############### 5085 5086 ### Annotate with TSV fields ### 5087 5088 # Init result tsv file 5089 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5090 5091 # Init result tsv file 5092 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5093 5094 # Parse TSV file and explode columns in INFO field 5095 if exomiser_to_info and os.path.exists(output_results_tsv): 5096 5097 # Log 5098 log.debug("Exomiser columns to VCF INFO field") 5099 5100 # Retrieve columns and types 5101 query = f""" SELECT * FROM read_csv('{output_results_tsv}', 
auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5102 output_results_tsv_df = self.get_query_to_df(query) 5103 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5104 5105 # Init concat fields for update 5106 sql_query_update_concat_fields = [] 5107 5108 # Fields to avoid 5109 fields_to_avoid = [ 5110 "CONTIG", 5111 "START", 5112 "END", 5113 "REF", 5114 "ALT", 5115 "QUAL", 5116 "FILTER", 5117 "GENOTYPE", 5118 ] 5119 5120 # List all columns to add into header 5121 for header_column in output_results_tsv_columns: 5122 5123 # If header column is enable 5124 if header_column not in fields_to_avoid: 5125 5126 # Header info type 5127 header_info_type = "String" 5128 header_column_df = output_results_tsv_df[header_column] 5129 header_column_df_dtype = header_column_df.dtype 5130 if header_column_df_dtype == object: 5131 if ( 5132 pd.to_numeric(header_column_df, errors="coerce") 5133 .notnull() 5134 .all() 5135 ): 5136 header_info_type = "Float" 5137 else: 5138 header_info_type = "Integer" 5139 5140 # Header info 5141 characters_to_validate = ["-"] 5142 pattern = "[" + "".join(characters_to_validate) + "]" 5143 header_info_name = re.sub( 5144 pattern, 5145 "_", 5146 f"Exomiser_{header_column}".replace("#", ""), 5147 ) 5148 header_info_number = "." 
5149 header_info_description = ( 5150 f"Exomiser {header_column} annotation" 5151 ) 5152 header_info_source = "Exomiser" 5153 header_info_version = "unknown" 5154 header_info_code = CODE_TYPE_MAP[header_info_type] 5155 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5156 header_info_name, 5157 header_info_number, 5158 header_info_type, 5159 header_info_description, 5160 header_info_source, 5161 header_info_version, 5162 header_info_code, 5163 ) 5164 5165 # Add field to add for update to concat fields 5166 sql_query_update_concat_fields.append( 5167 f""" 5168 CASE 5169 WHEN table_parquet."{header_column}" NOT IN ('','.') 5170 THEN concat( 5171 '{header_info_name}=', 5172 table_parquet."{header_column}", 5173 ';' 5174 ) 5175 5176 ELSE '' 5177 END 5178 """ 5179 ) 5180 5181 # Update query 5182 sql_query_update = f""" 5183 UPDATE {table_variants} as table_variants 5184 SET INFO = concat( 5185 CASE 5186 WHEN INFO NOT IN ('', '.') 5187 THEN INFO 5188 ELSE '' 5189 END, 5190 CASE 5191 WHEN table_variants.INFO NOT IN ('','.') 5192 THEN ';' 5193 ELSE '' 5194 END, 5195 ( 5196 SELECT 5197 concat( 5198 {",".join(sql_query_update_concat_fields)} 5199 ) 5200 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5201 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5202 AND table_parquet.\"START\" = table_variants.\"POS\" 5203 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5204 AND table_parquet.\"REF\" = table_variants.\"REF\" 5205 ) 5206 ) 5207 ; 5208 """ 5209 5210 # Update 5211 self.conn.execute(sql_query_update) 5212 5213 ### Annotate with VCF INFO field ### 5214 5215 # Init result VCF file 5216 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5217 5218 # If VCF exists 5219 if os.path.exists(output_results_vcf): 5220 5221 # Log 5222 log.debug("Exomiser result VCF update variants") 5223 5224 # Find Exomiser INFO field annotation in header 5225 with 
gzip.open(output_results_vcf, "rt") as f:
                    header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                # Add annotation INFO field to header
                vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                # Update variants with VCF
                self.update_from_vcf(output_results_vcf)

        return True

    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate loaded variants with snpEff.

        Exports the variants table to a temporary VCF, runs the snpEff jar on it,
        merges any new INFO header definitions (notably 'ANN') into the in-memory
        VCF header, then updates the variants table from the annotated VCF.

        :param threads: The number of threads to use (defaults to self.get_threads())
        :return: None; returns early if the variants table is empty
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but never used in this method —
        # tmp files rely on NamedTemporaryFile delete flags instead; confirm intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - snpEff bin command (jar launched through the configured java wrapper)
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases (created on demand if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        # NOTE(review): 'options' is read here but never used; the effective
        # options string is rebuilt below into 'snpeff_options' — confirm intent
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options ("OUTPUT" placeholder in stats paths is replaced by the output file)
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): snpeff_java_options is built but not appended to
        # snpeff_command below — presumably get_bin_command embeds java options;
        # verify, otherwise the memory limit is silently ignored
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        # NOTE(review): log message says "Exomiser" but these are the snpEff java options
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Annotation is always (re)computed, even when ANN already exists in the header
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (download if missing for this assembly)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file (INFO stripped, no samples, indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file for snpEff stdout (annotated VCF) and stderr capture
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command: snpEff writes the annotated VCF to stdout, messages to the .err file
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages: scan captured stderr for htslib-style [W::]/[E::] markers
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed: any [E::] line aborts the annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header of the snpEff output and merge new INFO fields
            # NOTE(review): variable is named 'annovar_vcf_header' (copy-paste from
            # the Annovar method) but holds the snpEff output header
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations

        :param threads: number of threads to use
        :return: the value of the variable "return_value".
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files accumulated across the whole run, removed at the end
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but cleanup below runs unconditionally
        # ('if True:') — confirm whether debug mode is meant to keep tmp files
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for the view/annotate/merge pipeline)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases (a list config keeps only its first entry)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: mapping of Annovar database name -> fields to keep
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder, created if missing
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Annotation is always (re)computed, even for fields already in the header
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is never appended to or executed — dead local
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO reduced to '.', no samples, indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (download missing files for this assembly)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run (and one annotated tmp VCF) per requested database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset here each iteration, discarding
                # earlier entries; consistent with the per-iteration error scan
                # below, but the initial err_files=[] above becomes redundant
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields actually requested (and their renamed targets)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one 'old new' mapping line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: gene-based for refGene/ensGene, region-based for cytoBand,
                # filter-based otherwise (Annovar -operation semantics)
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ('genebase' is consumed above, not forwarded)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # original (not renamed) names: removal happens before --rename-annots
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan captured stderr for htslib/Annovar markers
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: initial VCF + one annotated VCF per database
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF and add new INFO fields
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command (always runs — see delete_tmp NOTE above)
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        It takes a VCF file, and annotates it with a parquet file

        :param threads: number of threads to use for the annotation
        :return: the value of the variable "result".
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update",
False) 5900 ) 5901 log.debug(f"force_update_annotation={force_update_annotation}") 5902 force_append_annotation = ( 5903 self.get_param() 5904 .get("annotation", {}) 5905 .get("options", {}) 5906 .get("annotations_append", False) 5907 ) 5908 log.debug(f"force_append_annotation={force_append_annotation}") 5909 5910 # Data 5911 table_variants = self.get_table_variants() 5912 5913 # Check if not empty 5914 log.debug("Check if not empty") 5915 sql_query_chromosomes_df = self.get_query_to_df( 5916 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5917 ) 5918 if not sql_query_chromosomes_df["count"][0]: 5919 log.info(f"VCF empty") 5920 return 5921 5922 # VCF header 5923 vcf_reader = self.get_header() 5924 log.debug("Initial header: " + str(vcf_reader.infos)) 5925 5926 # Nb Variants POS 5927 log.debug("NB Variants Start") 5928 nb_variants = self.conn.execute( 5929 f"SELECT count(*) AS count FROM variants" 5930 ).fetchdf()["count"][0] 5931 log.debug("NB Variants Stop") 5932 5933 # Existing annotations 5934 for vcf_annotation in self.get_header().infos: 5935 5936 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5937 log.debug( 5938 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5939 ) 5940 5941 # Added columns 5942 added_columns = [] 5943 5944 # drop indexes 5945 log.debug(f"Drop indexes...") 5946 self.drop_indexes() 5947 5948 if annotations: 5949 5950 if "ALL" in annotations: 5951 5952 all_param = annotations.get("ALL", {}) 5953 all_param_formats = all_param.get("formats", None) 5954 all_param_releases = all_param.get("releases", None) 5955 5956 databases_infos_dict = self.scan_databases( 5957 database_formats=all_param_formats, 5958 database_releases=all_param_releases, 5959 ) 5960 for database_infos in databases_infos_dict.keys(): 5961 if database_infos not in annotations: 5962 annotations[database_infos] = {"INFO": None} 5963 5964 for annotation in annotations: 5965 5966 if annotation in ["ALL"]: 
5967 continue 5968 5969 # Annotation Name 5970 annotation_name = os.path.basename(annotation) 5971 5972 # Annotation fields 5973 annotation_fields = annotations[annotation] 5974 if not annotation_fields: 5975 annotation_fields = {"INFO": None} 5976 5977 log.debug(f"Annotation '{annotation_name}'") 5978 log.debug( 5979 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5980 ) 5981 5982 # Create Database 5983 database = Database( 5984 database=annotation, 5985 databases_folders=databases_folders, 5986 assembly=assembly, 5987 ) 5988 5989 # Find files 5990 parquet_file = database.get_database() 5991 parquet_hdr_file = database.get_header_file() 5992 parquet_type = database.get_type() 5993 5994 # Check if files exists 5995 if not parquet_file or not parquet_hdr_file: 5996 msg_err_list = [] 5997 if not parquet_file: 5998 msg_err_list.append( 5999 f"Annotation failed: Annotation file not found" 6000 ) 6001 if parquet_file and not parquet_hdr_file: 6002 msg_err_list.append( 6003 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 6004 ) 6005 6006 log.error(". ".join(msg_err_list)) 6007 raise ValueError(". 
".join(msg_err_list)) 6008 else: 6009 # Get parquet connexion 6010 parquet_sql_attach = database.get_sql_database_attach( 6011 output="query" 6012 ) 6013 if parquet_sql_attach: 6014 self.conn.execute(parquet_sql_attach) 6015 parquet_file_link = database.get_sql_database_link() 6016 # Log 6017 log.debug( 6018 f"Annotation '{annotation_name}' - file: " 6019 + str(parquet_file) 6020 + " and " 6021 + str(parquet_hdr_file) 6022 ) 6023 6024 # Database full header columns 6025 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6026 parquet_hdr_file 6027 ) 6028 # Log 6029 log.debug( 6030 "Annotation database header columns : " 6031 + str(parquet_hdr_vcf_header_columns) 6032 ) 6033 6034 # Load header as VCF object 6035 parquet_hdr_vcf_header_infos = database.get_header().infos 6036 # Log 6037 log.debug( 6038 "Annotation database header: " 6039 + str(parquet_hdr_vcf_header_infos) 6040 ) 6041 6042 # Get extra infos 6043 parquet_columns = database.get_extra_columns() 6044 # Log 6045 log.debug("Annotation database Columns: " + str(parquet_columns)) 6046 6047 # Add extra columns if "ALL" in annotation_fields 6048 # if "ALL" in annotation_fields: 6049 # allow_add_extra_column = True 6050 if "ALL" in annotation_fields and database.get_extra_columns(): 6051 for extra_column in database.get_extra_columns(): 6052 if ( 6053 extra_column not in annotation_fields 6054 and extra_column.replace("INFO/", "") 6055 not in parquet_hdr_vcf_header_infos 6056 ): 6057 parquet_hdr_vcf_header_infos[extra_column] = ( 6058 vcf.parser._Info( 6059 extra_column, 6060 ".", 6061 "String", 6062 f"{extra_column} description", 6063 "unknown", 6064 "unknown", 6065 self.code_type_map["String"], 6066 ) 6067 ) 6068 6069 # For all fields in database 6070 annotation_fields_all = False 6071 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6072 annotation_fields_all = True 6073 annotation_fields = { 6074 key: key for key in parquet_hdr_vcf_header_infos 6075 } 6076 6077 log.debug( 6078 
"Annotation database header - All annotations added: " 6079 + str(annotation_fields) 6080 ) 6081 6082 # Init 6083 6084 # List of annotation fields to use 6085 sql_query_annotation_update_info_sets = [] 6086 6087 # List of annotation to agregate 6088 sql_query_annotation_to_agregate = [] 6089 6090 # Number of fields 6091 nb_annotation_field = 0 6092 6093 # Annotation fields processed 6094 annotation_fields_processed = [] 6095 6096 # Columns mapping 6097 map_columns = database.map_columns( 6098 columns=annotation_fields, prefixes=["INFO/"] 6099 ) 6100 6101 # Query dict for fields to remove (update option) 6102 query_dict_remove = {} 6103 6104 # Fetch Anotation fields 6105 for annotation_field in annotation_fields: 6106 6107 # annotation_field_column 6108 annotation_field_column = map_columns.get( 6109 annotation_field, "INFO" 6110 ) 6111 6112 # field new name, if parametered 6113 annotation_fields_new_name = annotation_fields.get( 6114 annotation_field, annotation_field 6115 ) 6116 if not annotation_fields_new_name: 6117 annotation_fields_new_name = annotation_field 6118 6119 # To annotate 6120 # force_update_annotation = True 6121 # force_append_annotation = True 6122 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6123 if annotation_field in parquet_hdr_vcf_header_infos and ( 6124 force_update_annotation 6125 or force_append_annotation 6126 or ( 6127 annotation_fields_new_name 6128 not in self.get_header().infos 6129 ) 6130 ): 6131 6132 # Add field to annotation to process list 6133 annotation_fields_processed.append( 6134 annotation_fields_new_name 6135 ) 6136 6137 # explode infos for the field 6138 annotation_fields_new_name_info_msg = "" 6139 if ( 6140 force_update_annotation 6141 and annotation_fields_new_name 6142 in self.get_header().infos 6143 ): 6144 # Remove field from INFO 6145 query = f""" 6146 UPDATE {table_variants} as table_variants 6147 SET INFO = 
REGEXP_REPLACE( 6148 concat(table_variants.INFO,''), 6149 ';*{annotation_fields_new_name}=[^;]*', 6150 '' 6151 ) 6152 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6153 """ 6154 annotation_fields_new_name_info_msg = " [update]" 6155 query_dict_remove[ 6156 f"remove 'INFO/{annotation_fields_new_name}'" 6157 ] = query 6158 6159 # Sep between fields in INFO 6160 nb_annotation_field += 1 6161 if nb_annotation_field > 1: 6162 annotation_field_sep = ";" 6163 else: 6164 annotation_field_sep = "" 6165 6166 log.info( 6167 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6168 ) 6169 6170 # Add INFO field to header 6171 parquet_hdr_vcf_header_infos_number = ( 6172 parquet_hdr_vcf_header_infos[annotation_field].num 6173 or "." 6174 ) 6175 parquet_hdr_vcf_header_infos_type = ( 6176 parquet_hdr_vcf_header_infos[annotation_field].type 6177 or "String" 6178 ) 6179 parquet_hdr_vcf_header_infos_description = ( 6180 parquet_hdr_vcf_header_infos[annotation_field].desc 6181 or f"{annotation_field} description" 6182 ) 6183 parquet_hdr_vcf_header_infos_source = ( 6184 parquet_hdr_vcf_header_infos[annotation_field].source 6185 or "unknown" 6186 ) 6187 parquet_hdr_vcf_header_infos_version = ( 6188 parquet_hdr_vcf_header_infos[annotation_field].version 6189 or "unknown" 6190 ) 6191 6192 vcf_reader.infos[annotation_fields_new_name] = ( 6193 vcf.parser._Info( 6194 annotation_fields_new_name, 6195 parquet_hdr_vcf_header_infos_number, 6196 parquet_hdr_vcf_header_infos_type, 6197 parquet_hdr_vcf_header_infos_description, 6198 parquet_hdr_vcf_header_infos_source, 6199 parquet_hdr_vcf_header_infos_version, 6200 self.code_type_map[ 6201 parquet_hdr_vcf_header_infos_type 6202 ], 6203 ) 6204 ) 6205 6206 # Append 6207 if force_append_annotation: 6208 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 
6209 else: 6210 query_case_when_append = "" 6211 6212 # Annotation/Update query fields 6213 # Found in INFO column 6214 if ( 6215 annotation_field_column == "INFO" 6216 and "INFO" in parquet_hdr_vcf_header_columns 6217 ): 6218 sql_query_annotation_update_info_sets.append( 6219 f""" 6220 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6221 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6222 ELSE '' 6223 END 6224 """ 6225 ) 6226 # Found in a specific column 6227 else: 6228 sql_query_annotation_update_info_sets.append( 6229 f""" 6230 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6231 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6232 ELSE '' 6233 END 6234 """ 6235 ) 6236 sql_query_annotation_to_agregate.append( 6237 f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6238 ) 6239 6240 # Not to annotate 6241 else: 6242 6243 if force_update_annotation: 6244 annotation_message = "forced" 6245 else: 6246 annotation_message = "skipped" 6247 6248 if annotation_field not in parquet_hdr_vcf_header_infos: 6249 log.warning( 6250 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6251 ) 6252 if annotation_fields_new_name in self.get_header().infos: 6253 log.warning( 6254 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6255 ) 6256 6257 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6258 # allow_annotation_full_info = True 6259 allow_annotation_full_info = not force_append_annotation 6260 6261 if parquet_type in ["regions"]: 6262 allow_annotation_full_info = False 6263 6264 if ( 6265 allow_annotation_full_info 6266 and nb_annotation_field == len(annotation_fields) 6267 and annotation_fields_all 6268 and ( 6269 "INFO" in parquet_hdr_vcf_header_columns 6270 and "INFO" in database.get_extra_columns() 6271 ) 6272 ): 6273 log.debug("Column INFO annotation enabled") 6274 sql_query_annotation_update_info_sets = [] 6275 sql_query_annotation_update_info_sets.append( 6276 f" table_parquet.INFO " 6277 ) 6278 6279 if sql_query_annotation_update_info_sets: 6280 6281 # Annotate 6282 log.info(f"Annotation '{annotation_name}' - Annotation...") 6283 6284 # Join query annotation update info sets for SQL 6285 sql_query_annotation_update_info_sets_sql = ",".join( 6286 sql_query_annotation_update_info_sets 6287 ) 6288 6289 # Check chromosomes list (and variants infos) 6290 sql_query_chromosomes = f""" 6291 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6292 FROM {table_variants} as table_variants 6293 GROUP BY table_variants."#CHROM" 6294 ORDER BY table_variants."#CHROM" 6295 """ 6296 sql_query_chromosomes_df = self.conn.execute( 6297 sql_query_chromosomes 6298 ).df() 6299 sql_query_chromosomes_dict = { 6300 entry["CHROM"]: { 6301 "count": entry["count_variants"], 6302 "min": entry["min_variants"], 6303 "max": entry["max_variants"], 6304 } 6305 for index, entry in sql_query_chromosomes_df.iterrows() 6306 } 6307 6308 # Init 6309 nb_of_query = 0 6310 nb_of_variant_annotated = 0 6311 query_dict = query_dict_remove 6312 6313 # for chrom in sql_query_chromosomes_df["CHROM"]: 6314 for chrom in sql_query_chromosomes_dict: 6315 6316 # Number of variant by chromosome 6317 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6318 chrom, {} 6319 ).get("count", 0) 6320 6321 
log.debug( 6322 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6323 ) 6324 6325 # Annotation with regions database 6326 if parquet_type in ["regions"]: 6327 sql_query_annotation_from_clause = f""" 6328 FROM ( 6329 SELECT 6330 '{chrom}' AS \"#CHROM\", 6331 table_variants_from.\"POS\" AS \"POS\", 6332 {",".join(sql_query_annotation_to_agregate)} 6333 FROM {table_variants} as table_variants_from 6334 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6335 table_parquet_from."#CHROM" = '{chrom}' 6336 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6337 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6338 ) 6339 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6340 GROUP BY table_variants_from.\"POS\" 6341 ) 6342 as table_parquet 6343 """ 6344 6345 sql_query_annotation_where_clause = """ 6346 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6347 AND table_parquet.\"POS\" = table_variants.\"POS\" 6348 """ 6349 6350 # Annotation with variants database 6351 else: 6352 sql_query_annotation_from_clause = f""" 6353 FROM {parquet_file_link} as table_parquet 6354 """ 6355 sql_query_annotation_where_clause = f""" 6356 table_variants."#CHROM" = '{chrom}' 6357 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6358 AND table_parquet.\"POS\" = table_variants.\"POS\" 6359 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6360 AND table_parquet.\"REF\" = table_variants.\"REF\" 6361 """ 6362 6363 # Create update query 6364 sql_query_annotation_chrom_interval_pos = f""" 6365 UPDATE {table_variants} as table_variants 6366 SET INFO = 6367 concat( 6368 CASE WHEN table_variants.INFO NOT IN ('','.') 6369 THEN table_variants.INFO 6370 ELSE '' 6371 END 6372 , 6373 CASE WHEN table_variants.INFO NOT IN ('','.') 6374 AND ( 6375 concat({sql_query_annotation_update_info_sets_sql}) 6376 ) 6377 NOT IN ('','.') 6378 THEN ';' 6379 ELSE '' 6380 END 6381 , 6382 
{sql_query_annotation_update_info_sets_sql} 6383 ) 6384 {sql_query_annotation_from_clause} 6385 WHERE {sql_query_annotation_where_clause} 6386 ; 6387 """ 6388 6389 # Add update query to dict 6390 query_dict[ 6391 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6392 ] = sql_query_annotation_chrom_interval_pos 6393 6394 nb_of_query = len(query_dict) 6395 num_query = 0 6396 6397 # SET max_expression_depth TO x 6398 self.conn.execute("SET max_expression_depth TO 10000") 6399 6400 for query_name in query_dict: 6401 query = query_dict[query_name] 6402 num_query += 1 6403 log.info( 6404 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6405 ) 6406 result = self.conn.execute(query) 6407 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6408 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6409 log.info( 6410 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6411 ) 6412 6413 log.info( 6414 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6415 ) 6416 6417 else: 6418 6419 log.info( 6420 f"Annotation '{annotation_name}' - No Annotations available" 6421 ) 6422 6423 log.debug("Final header: " + str(vcf_reader.infos)) 6424 6425 # Remove added columns 6426 for added_column in added_columns: 6427 self.drop_column(column=added_column) 6428 6429 def annotation_splice(self, threads: int = None) -> None: 6430 """ 6431 This function annotate with snpEff 6432 6433 :param threads: The number of threads to use 6434 :return: the value of the variable "return_value". 
6435 """ 6436 6437 # DEBUG 6438 log.debug("Start annotation with splice tools") 6439 6440 # Threads 6441 if not threads: 6442 threads = self.get_threads() 6443 log.debug("Threads: " + str(threads)) 6444 6445 # DEBUG 6446 delete_tmp = True 6447 if self.get_config().get("verbosity", "warning") in ["debug"]: 6448 delete_tmp = False 6449 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6450 6451 # Config 6452 config = self.get_config() 6453 log.debug("Config: " + str(config)) 6454 splice_config = config.get("tools", {}).get("splice", {}) 6455 if not splice_config: 6456 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6457 msg_err = "No Splice tool config" 6458 raise ValueError(msg_err) 6459 log.debug(f"splice_config: {splice_config}") 6460 6461 # Config - Folders - Databases 6462 databases_folders = ( 6463 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6464 ) 6465 log.debug("Databases annotations: " + str(databases_folders)) 6466 6467 # Splice docker image 6468 splice_docker_image = splice_config.get("docker").get("image") 6469 6470 # Pull splice image if it's not already there 6471 if not check_docker_image_exists(splice_docker_image): 6472 log.warning( 6473 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6474 ) 6475 try: 6476 command(f"docker pull {splice_config.get('docker').get('image')}") 6477 except subprocess.CalledProcessError: 6478 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6479 log.error(msg_err) 6480 raise ValueError(msg_err) 6481 6482 # Config - splice databases 6483 splice_databases = ( 6484 config.get("folders", {}) 6485 .get("databases", {}) 6486 .get("splice", DEFAULT_SPLICE_FOLDER) 6487 ) 6488 splice_databases = full_path(splice_databases) 6489 6490 # Param 6491 param = self.get_param() 6492 log.debug("Param: " + str(param)) 6493 6494 # Param 6495 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6496 
log.debug("Options: " + str(options)) 6497 6498 # Data 6499 table_variants = self.get_table_variants() 6500 6501 # Check if not empty 6502 log.debug("Check if not empty") 6503 sql_query_chromosomes = ( 6504 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6505 ) 6506 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6507 log.info("VCF empty") 6508 return None 6509 6510 # Export in VCF 6511 log.debug("Create initial file to annotate") 6512 6513 # Create output folder / work folder 6514 if options.get("output_folder", ""): 6515 output_folder = options.get("output_folder", "") 6516 if not os.path.exists(output_folder): 6517 Path(output_folder).mkdir(parents=True, exist_ok=True) 6518 else: 6519 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6520 if not os.path.exists(output_folder): 6521 Path(output_folder).mkdir(parents=True, exist_ok=True) 6522 6523 if options.get("workdir", ""): 6524 workdir = options.get("workdir", "") 6525 else: 6526 workdir = "/work" 6527 6528 # Create tmp VCF file 6529 tmp_vcf = NamedTemporaryFile( 6530 prefix=self.get_prefix(), 6531 dir=output_folder, 6532 suffix=".vcf", 6533 delete=False, 6534 ) 6535 tmp_vcf_name = tmp_vcf.name 6536 6537 # VCF header 6538 header = self.get_header() 6539 6540 # Existing annotations 6541 for vcf_annotation in self.get_header().infos: 6542 6543 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6544 log.debug( 6545 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6546 ) 6547 6548 # Memory limit 6549 if config.get("memory", None): 6550 memory_limit = config.get("memory", "8G").upper() 6551 # upper() 6552 else: 6553 memory_limit = "8G" 6554 log.debug(f"memory_limit: {memory_limit}") 6555 6556 # Check number of variants to annotate 6557 where_clause_regex_spliceai = r"SpliceAI_\w+" 6558 where_clause_regex_spip = r"SPiP_\w+" 6559 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6560 df_list_of_variants_to_annotate = self.get_query_to_df( 6561 query=f""" SELECT * FROM variants {where_clause} """ 6562 ) 6563 if len(df_list_of_variants_to_annotate) == 0: 6564 log.warning( 6565 f"No variants to annotate with splice. Variants probably already annotated with splice" 6566 ) 6567 return None 6568 else: 6569 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6570 6571 # Export VCF file 6572 self.export_variant_vcf( 6573 vcf_file=tmp_vcf_name, 6574 remove_info=True, 6575 add_samples=True, 6576 index=False, 6577 where_clause=where_clause, 6578 ) 6579 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6580 if any(value for value in splice_config.values() if value is None): 6581 log.warning("At least one splice config parameter is empty") 6582 # exit annotation_splice 6583 return None 6584 6585 # Params in splice nf 6586 def check_values(dico: dict): 6587 """ 6588 Ensure parameters for NF splice pipeline 6589 """ 6590 for key, val in dico.items(): 6591 if key == "genome": 6592 if any( 6593 assemb in options.get("genome", {}) 6594 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6595 ): 6596 yield f"--{key} hg19" 6597 elif any( 6598 assemb in options.get("genome", {}) 6599 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6600 ): 6601 yield f"--{key} hg38" 6602 elif ( 6603 (isinstance(val, str) and val) 6604 or isinstance(val, int) 6605 or isinstance(val, bool) 6606 ): 6607 yield f"--{key} {val}" 6608 6609 # Genome 6610 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6611 options["genome"] = genome 6612 # NF params 6613 nf_params = [] 6614 # Add options 6615 if options: 6616 log.debug(options) 6617 nf_params = list(check_values(options)) 6618 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6619 else: 6620 log.debug("No NF params provided") 6621 # Add threads 6622 if "threads" not in 
options.keys(): 6623 nf_params.append(f"--threads {threads}") 6624 # Genome path 6625 genome_path = find_genome( 6626 config.get("folders", {}) 6627 .get("databases", {}) 6628 .get("genomes", DEFAULT_GENOME_FOLDER), 6629 file=f"{genome}.fa", 6630 ) 6631 # Add genome path 6632 if not genome_path: 6633 raise ValueError( 6634 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6635 ) 6636 else: 6637 log.debug(f"Genome: {genome_path}") 6638 nf_params.append(f"--genome_path {genome_path}") 6639 6640 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6641 """ 6642 Setting up updated databases for SPiP and SpliceAI 6643 """ 6644 6645 try: 6646 6647 # SpliceAI assembly transcriptome 6648 spliceai_assembly = os.path.join( 6649 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6650 options.get("genome"), 6651 "transcriptome", 6652 ) 6653 spip_assembly = options.get("genome") 6654 6655 spip = find( 6656 f"transcriptome_{spip_assembly}.RData", 6657 config.get("folders", {}).get("databases", {}).get("spip", {}), 6658 ) 6659 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6660 log.debug(f"SPiP annotations: {spip}") 6661 log.debug(f"SpliceAI annotations: {spliceai}") 6662 if spip and spliceai: 6663 return [ 6664 f"--spip_transcriptome {spip}", 6665 f"--spliceai_transcriptome {spliceai}", 6666 ] 6667 else: 6668 log.warning( 6669 "Can't find splice databases in configuration, use annotations file from image" 6670 ) 6671 except TypeError: 6672 log.warning( 6673 "Can't find splice databases in configuration, use annotations file from image" 6674 ) 6675 return [] 6676 6677 # Add options, check if transcriptome option have already beend provided 6678 if ( 6679 "spip_transcriptome" not in nf_params 6680 and "spliceai_transcriptome" not in nf_params 6681 ): 6682 splice_reference = splice_annotations(options, config) 6683 if splice_reference: 6684 
nf_params.extend(splice_reference) 6685 # nf_params.append(f"--output_folder {output_folder}") 6686 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6687 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6688 log.debug(cmd) 6689 splice_config["docker"]["command"] = cmd 6690 6691 # Ensure proxy is set 6692 proxy = [ 6693 f"-e {var}={os.getenv(var)}" 6694 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6695 if os.getenv(var) is not None 6696 ] 6697 docker_cmd = get_bin_command( 6698 tool="splice", 6699 bin_type="docker", 6700 config=config, 6701 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6702 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6703 ) 6704 # print(docker_cmd) 6705 # exit() 6706 # Docker debug 6707 # if splice_config.get("rm_container"): 6708 # rm_container = "--rm" 6709 # else: 6710 # rm_container = "" 6711 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6712 log.debug(docker_cmd) 6713 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6714 log.debug(res.stdout) 6715 if res.stderr: 6716 log.error(res.stderr) 6717 res.check_returncode() 6718 # Update variants 6719 log.info("Annotation - Updating...") 6720 # Test find output vcf 6721 log.debug( 6722 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6723 ) 6724 output_vcf = [] 6725 # Wrong folder to look in 6726 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6727 if ( 6728 files 6729 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6730 ): 6731 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6732 # log.debug(os.listdir(options.get("output_folder"))) 6733 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6734 if not output_vcf: 6735 log.debug( 6736 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6737 ) 6738 else: 6739 # Get new header from annotated vcf 6740 log.debug(f"Initial header: {len(header.infos)} fields") 6741 # Create new header with splice infos 6742 new_vcf = Variants(input=output_vcf[0]) 6743 new_vcf_header = new_vcf.get_header().infos 6744 for keys, infos in new_vcf_header.items(): 6745 if keys not in header.infos.keys(): 6746 header.infos[keys] = infos 6747 log.debug(f"New header: {len(header.infos)} fields") 6748 log.debug(f"Splice tmp output: {output_vcf[0]}") 6749 self.update_from_vcf(output_vcf[0]) 6750 6751 # Remove file 6752 remove_if_exists(output_vcf) 6753 6754 ### 6755 # Prioritization 6756 ### 6757 6758 def get_config_default(self, name: str) -> dict: 6759 """ 6760 The function `get_config_default` returns a dictionary containing default configurations for 6761 various calculations and prioritizations. 6762 6763 :param name: The `get_config_default` function returns a dictionary containing default 6764 configurations for different calculations and prioritizations. The `name` parameter is used to 6765 specify which specific configuration to retrieve from the dictionary 6766 :type name: str 6767 :return: The function `get_config_default` returns a dictionary containing default configuration 6768 settings for different calculations and prioritizations. The specific configuration settings are 6769 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6770 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6771 returned. If there is no match, an empty dictionary is returned. 
6772 """ 6773 6774 config_default = { 6775 "calculations": { 6776 "variant_chr_pos_alt_ref": { 6777 "type": "sql", 6778 "name": "variant_chr_pos_alt_ref", 6779 "description": "Create a variant ID with chromosome, position, alt and ref", 6780 "available": False, 6781 "output_column_name": "variant_chr_pos_alt_ref", 6782 "output_column_type": "String", 6783 "output_column_description": "variant ID with chromosome, position, alt and ref", 6784 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6785 "operation_info": True, 6786 }, 6787 "VARTYPE": { 6788 "type": "sql", 6789 "name": "VARTYPE", 6790 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6791 "available": True, 6792 "table": "variants", 6793 "output_column_name": "VARTYPE", 6794 "output_column_type": "String", 6795 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6796 "operation_query": """ 6797 CASE 6798 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6799 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6800 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6801 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6802 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6803 ELSE 'UNDEFINED' 6804 END 6805 """, 6806 "info_fields": ["SVTYPE"], 6807 "operation_info": True, 6808 }, 6809 "snpeff_hgvs": { 6810 "type": "python", 6811 "name": "snpeff_hgvs", 6812 "description": "HGVS nomenclatures from snpEff annotation", 6813 "available": True, 6814 "function_name": "calculation_extract_snpeff_hgvs", 6815 "function_params": ["snpeff_hgvs", "ANN"], 6816 }, 6817 "snpeff_ann_explode": { 6818 "type": "python", 6819 "name": "snpeff_ann_explode", 6820 "description": "Explode snpEff annotations with uniquify values", 6821 "available": True, 6822 "function_name": "calculation_snpeff_ann_explode", 6823 "function_params": [False, "fields", "snpeff_", "ANN"], 6824 }, 6825 "snpeff_ann_explode_uniquify": { 6826 "type": "python", 6827 
"name": "snpeff_ann_explode_uniquify", 6828 "description": "Explode snpEff annotations", 6829 "available": True, 6830 "function_name": "calculation_snpeff_ann_explode", 6831 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6832 }, 6833 "snpeff_ann_explode_json": { 6834 "type": "python", 6835 "name": "snpeff_ann_explode_json", 6836 "description": "Explode snpEff annotations in JSON format", 6837 "available": True, 6838 "function_name": "calculation_snpeff_ann_explode", 6839 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6840 }, 6841 "NOMEN": { 6842 "type": "python", 6843 "name": "NOMEN", 6844 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6845 "available": True, 6846 "function_name": "calculation_extract_nomen", 6847 "function_params": [], 6848 }, 6849 "RENAME_INFO_FIELDS": { 6850 "type": "python", 6851 "name": "RENAME_INFO_FIELDS", 6852 "description": "Rename or remove INFO/tags", 6853 "available": True, 6854 "function_name": "calculation_rename_info_fields", 6855 "function_params": [], 6856 }, 6857 "FINDBYPIPELINE": { 6858 "type": "python", 6859 "name": "FINDBYPIPELINE", 6860 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6861 "available": True, 6862 "function_name": "calculation_find_by_pipeline", 6863 "function_params": ["findbypipeline"], 6864 }, 6865 "FINDBYSAMPLE": { 6866 "type": "python", 6867 "name": "FINDBYSAMPLE", 6868 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6869 "available": True, 6870 "function_name": "calculation_find_by_pipeline", 6871 "function_params": ["findbysample"], 6872 }, 6873 "GENOTYPECONCORDANCE": { 6874 "type": "python", 6875 "name": "GENOTYPECONCORDANCE", 6876 "description": "Concordance of genotype for multi caller VCF", 6877 "available": True, 6878 "function_name": "calculation_genotype_concordance", 6879 "function_params": [], 6880 }, 6881 
"BARCODE": { 6882 "type": "python", 6883 "name": "BARCODE", 6884 "description": "BARCODE as VaRank tool", 6885 "available": True, 6886 "function_name": "calculation_barcode", 6887 "function_params": [], 6888 }, 6889 "BARCODEFAMILY": { 6890 "type": "python", 6891 "name": "BARCODEFAMILY", 6892 "description": "BARCODEFAMILY as VaRank tool", 6893 "available": True, 6894 "function_name": "calculation_barcode_family", 6895 "function_params": ["BCF"], 6896 }, 6897 "TRIO": { 6898 "type": "python", 6899 "name": "TRIO", 6900 "description": "Inheritance for a trio family", 6901 "available": True, 6902 "function_name": "calculation_trio", 6903 "function_params": [], 6904 }, 6905 "VAF": { 6906 "type": "python", 6907 "name": "VAF", 6908 "description": "Variant Allele Frequency (VAF) harmonization", 6909 "available": True, 6910 "function_name": "calculation_vaf_normalization", 6911 "function_params": [], 6912 }, 6913 "VAF_stats": { 6914 "type": "python", 6915 "name": "VAF_stats", 6916 "description": "Variant Allele Frequency (VAF) statistics", 6917 "available": True, 6918 "function_name": "calculation_genotype_stats", 6919 "function_params": ["VAF"], 6920 }, 6921 "DP_stats": { 6922 "type": "python", 6923 "name": "DP_stats", 6924 "description": "Depth (DP) statistics", 6925 "available": True, 6926 "function_name": "calculation_genotype_stats", 6927 "function_params": ["DP"], 6928 }, 6929 "variant_id": { 6930 "type": "python", 6931 "name": "variant_id", 6932 "description": "Variant ID generated from variant position and type", 6933 "available": True, 6934 "function_name": "calculation_variant_id", 6935 "function_params": [], 6936 }, 6937 "transcripts_json": { 6938 "type": "python", 6939 "name": "transcripts_json", 6940 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6941 "available": True, 6942 "function_name": "calculation_transcripts_annotation", 6943 "function_params": ["transcripts_json", None], 6944 }, 6945 "transcripts_ann": { 6946 
"type": "python", 6947 "name": "transcripts_ann", 6948 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6949 "available": True, 6950 "function_name": "calculation_transcripts_annotation", 6951 "function_params": [None, "transcripts_ann"], 6952 }, 6953 "transcripts_annotations": { 6954 "type": "python", 6955 "name": "transcripts_annotations", 6956 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6957 "available": True, 6958 "function_name": "calculation_transcripts_annotation", 6959 "function_params": [None, None], 6960 }, 6961 "transcripts_prioritization": { 6962 "type": "python", 6963 "name": "transcripts_prioritization", 6964 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6965 "available": True, 6966 "function_name": "calculation_transcripts_prioritization", 6967 "function_params": [], 6968 }, 6969 "transcripts_export": { 6970 "type": "python", 6971 "name": "transcripts_export", 6972 "description": "Export transcripts table/view as a file (using param.json)", 6973 "available": True, 6974 "function_name": "calculation_transcripts_export", 6975 "function_params": [], 6976 }, 6977 }, 6978 "prioritizations": { 6979 "default": { 6980 "ANN2": [ 6981 { 6982 "type": "contains", 6983 "value": "HIGH", 6984 "score": 5, 6985 "flag": "PASS", 6986 "comment": [ 6987 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6988 ], 6989 }, 6990 { 6991 "type": "contains", 6992 "value": "MODERATE", 6993 "score": 3, 6994 "flag": "PASS", 6995 "comment": [ 6996 "A non-disruptive variant that might change protein effectiveness" 6997 ], 6998 }, 6999 { 7000 "type": "contains", 7001 "value": "LOW", 7002 "score": 0, 7003 "flag": "FILTERED", 7004 "comment": [ 7005 "Assumed to be mostly harmless or unlikely to change protein behavior" 7006 ], 7007 
}, 7008 { 7009 "type": "contains", 7010 "value": "MODIFIER", 7011 "score": 0, 7012 "flag": "FILTERED", 7013 "comment": [ 7014 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7015 ], 7016 }, 7017 ], 7018 } 7019 }, 7020 } 7021 7022 return config_default.get(name, None) 7023 7024 def get_config_json( 7025 self, name: str, config_dict: dict = {}, config_file: str = None 7026 ) -> dict: 7027 """ 7028 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7029 default values, a dictionary, and a file. 7030 7031 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7032 the name of the configuration. It is used to identify and retrieve the configuration settings 7033 for a specific component or module 7034 :type name: str 7035 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7036 dictionary that allows you to provide additional configuration settings or overrides. When you 7037 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7038 the key is the configuration setting you want to override or 7039 :type config_dict: dict 7040 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7041 specify the path to a configuration file that contains additional settings. If provided, the 7042 function will read the contents of this file and update the configuration dictionary with the 7043 values found in the file, overriding any existing values with the 7044 :type config_file: str 7045 :return: The function `get_config_json` returns a dictionary containing the configuration 7046 settings. 
7047 """ 7048 7049 # Create with default prioritizations 7050 config_default = self.get_config_default(name=name) 7051 configuration = config_default 7052 # log.debug(f"configuration={configuration}") 7053 7054 # Replace prioritizations from dict 7055 for config in config_dict: 7056 configuration[config] = config_dict[config] 7057 7058 # Replace prioritizations from file 7059 config_file = full_path(config_file) 7060 if config_file: 7061 if os.path.exists(config_file): 7062 with open(config_file) as config_file_content: 7063 config_file_dict = yaml.safe_load(config_file_content) 7064 for config in config_file_dict: 7065 configuration[config] = config_file_dict[config] 7066 else: 7067 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7068 log.error(msg_error) 7069 raise ValueError(msg_error) 7070 7071 return configuration 7072 7073 def prioritization( 7074 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7075 ) -> bool: 7076 """ 7077 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7078 prioritizes variants based on configured profiles and criteria. 7079 7080 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7081 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7082 a table name is provided, the method will prioritize the variants in that specific table 7083 :type table: str 7084 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7085 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7086 provided, the code will use a default prefix value of "PZ" 7087 :type pz_prefix: str 7088 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7089 additional parameters specific to the prioritization process. 
These parameters can include 7090 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7091 configurations needed for the prioritization of variants in a V 7092 :type pz_param: dict 7093 :return: A boolean value (True) is being returned from the `prioritization` function. 7094 """ 7095 7096 # Config 7097 config = self.get_config() 7098 7099 # Param 7100 param = self.get_param() 7101 7102 # Prioritization param 7103 if pz_param is not None: 7104 prioritization_param = pz_param 7105 else: 7106 prioritization_param = param.get("prioritization", {}) 7107 7108 # Configuration profiles 7109 prioritization_config_file = prioritization_param.get( 7110 "prioritization_config", None 7111 ) 7112 prioritization_config_file = full_path(prioritization_config_file) 7113 prioritizations_config = self.get_config_json( 7114 name="prioritizations", config_file=prioritization_config_file 7115 ) 7116 7117 # Prioritization prefix 7118 pz_prefix_default = "PZ" 7119 if pz_prefix is None: 7120 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7121 7122 # Prioritization options 7123 profiles = prioritization_param.get("profiles", []) 7124 if isinstance(profiles, str): 7125 profiles = profiles.split(",") 7126 pzfields = prioritization_param.get( 7127 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7128 ) 7129 if isinstance(pzfields, str): 7130 pzfields = pzfields.split(",") 7131 default_profile = prioritization_param.get("default_profile", None) 7132 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7133 prioritization_score_mode = prioritization_param.get( 7134 "prioritization_score_mode", "HOWARD" 7135 ) 7136 7137 # Quick Prioritizations 7138 prioritizations = param.get("prioritizations", None) 7139 if prioritizations: 7140 log.info("Quick Prioritization:") 7141 for profile in prioritizations.split(","): 7142 if profile not in profiles: 7143 profiles.append(profile) 7144 log.info(f" {profile}") 7145 7146 # If 
profile "ALL" provided, all profiles in the config profiles 7147 if "ALL" in profiles: 7148 profiles = list(prioritizations_config.keys()) 7149 7150 for profile in profiles: 7151 if prioritizations_config.get(profile, None): 7152 log.debug(f"Profile '{profile}' configured") 7153 else: 7154 msg_error = f"Profile '{profile}' NOT configured" 7155 log.error(msg_error) 7156 raise ValueError(msg_error) 7157 7158 if profiles: 7159 log.info(f"Prioritization... ") 7160 else: 7161 log.debug(f"No profile defined") 7162 return False 7163 7164 if not default_profile and len(profiles): 7165 default_profile = profiles[0] 7166 7167 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7168 log.debug("Profiles to check: " + str(list(profiles))) 7169 7170 # Variables 7171 if table is not None: 7172 table_variants = table 7173 else: 7174 table_variants = self.get_table_variants(clause="update") 7175 log.debug(f"Table to prioritize: {table_variants}") 7176 7177 # Added columns 7178 added_columns = [] 7179 7180 # Create list of PZfields 7181 # List of PZFields 7182 list_of_pzfields_original = pzfields + [ 7183 pzfield + pzfields_sep + profile 7184 for pzfield in pzfields 7185 for profile in profiles 7186 ] 7187 list_of_pzfields = [] 7188 log.debug(f"{list_of_pzfields_original}") 7189 7190 # Remove existing PZfields to use if exists 7191 for pzfield in list_of_pzfields_original: 7192 if self.get_header().infos.get(pzfield, None) is None: 7193 list_of_pzfields.append(pzfield) 7194 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7195 else: 7196 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7197 7198 if list_of_pzfields: 7199 7200 # Explode Infos prefix 7201 explode_infos_prefix = self.get_explode_infos_prefix() 7202 7203 # PZfields tags description 7204 PZfields_INFOS = { 7205 f"{pz_prefix}Tags": { 7206 "ID": f"{pz_prefix}Tags", 7207 "Number": ".", 7208 "Type": "String", 7209 "Description": "Variant tags based on annotation 
criteria", 7210 }, 7211 f"{pz_prefix}Score": { 7212 "ID": f"{pz_prefix}Score", 7213 "Number": 1, 7214 "Type": "Integer", 7215 "Description": "Variant score based on annotation criteria", 7216 }, 7217 f"{pz_prefix}Flag": { 7218 "ID": f"{pz_prefix}Flag", 7219 "Number": 1, 7220 "Type": "String", 7221 "Description": "Variant flag based on annotation criteria", 7222 }, 7223 f"{pz_prefix}Comment": { 7224 "ID": f"{pz_prefix}Comment", 7225 "Number": ".", 7226 "Type": "String", 7227 "Description": "Variant comment based on annotation criteria", 7228 }, 7229 f"{pz_prefix}Infos": { 7230 "ID": f"{pz_prefix}Infos", 7231 "Number": ".", 7232 "Type": "String", 7233 "Description": "Variant infos based on annotation criteria", 7234 }, 7235 f"{pz_prefix}Class": { 7236 "ID": f"{pz_prefix}Class", 7237 "Number": ".", 7238 "Type": "String", 7239 "Description": "Variant class based on annotation criteria", 7240 }, 7241 } 7242 7243 # Create INFO fields if not exist 7244 for field in PZfields_INFOS: 7245 field_ID = PZfields_INFOS[field]["ID"] 7246 field_description = PZfields_INFOS[field]["Description"] 7247 if field_ID not in self.get_header().infos and field_ID in pzfields: 7248 field_description = ( 7249 PZfields_INFOS[field]["Description"] 7250 + f", profile {default_profile}" 7251 ) 7252 self.get_header().infos[field_ID] = vcf.parser._Info( 7253 field_ID, 7254 PZfields_INFOS[field]["Number"], 7255 PZfields_INFOS[field]["Type"], 7256 field_description, 7257 "unknown", 7258 "unknown", 7259 code_type_map[PZfields_INFOS[field]["Type"]], 7260 ) 7261 7262 # Create INFO fields if not exist for each profile 7263 for profile in prioritizations_config: 7264 if profile in profiles or profiles == []: 7265 for field in PZfields_INFOS: 7266 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7267 field_description = ( 7268 PZfields_INFOS[field]["Description"] 7269 + f", profile {profile}" 7270 ) 7271 if ( 7272 field_ID not in self.get_header().infos 7273 and field in pzfields 7274 ): 
7275 self.get_header().infos[field_ID] = vcf.parser._Info( 7276 field_ID, 7277 PZfields_INFOS[field]["Number"], 7278 PZfields_INFOS[field]["Type"], 7279 field_description, 7280 "unknown", 7281 "unknown", 7282 code_type_map[PZfields_INFOS[field]["Type"]], 7283 ) 7284 7285 # Header 7286 for pzfield in list_of_pzfields: 7287 if re.match(f"{pz_prefix}Score.*", pzfield): 7288 added_column = self.add_column( 7289 table_name=table_variants, 7290 column_name=pzfield, 7291 column_type="INTEGER", 7292 default_value="0", 7293 ) 7294 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7295 added_column = self.add_column( 7296 table_name=table_variants, 7297 column_name=pzfield, 7298 column_type="BOOLEAN", 7299 default_value="1", 7300 ) 7301 elif re.match(f"{pz_prefix}Class.*", pzfield): 7302 added_column = self.add_column( 7303 table_name=table_variants, 7304 column_name=pzfield, 7305 column_type="VARCHAR[]", 7306 default_value="null", 7307 ) 7308 else: 7309 added_column = self.add_column( 7310 table_name=table_variants, 7311 column_name=pzfield, 7312 column_type="STRING", 7313 default_value="''", 7314 ) 7315 added_columns.append(added_column) 7316 7317 # Profiles 7318 if profiles: 7319 7320 # foreach profile in configuration file 7321 for profile in prioritizations_config: 7322 7323 # If profile is asked in param, or ALL are asked (empty profile []) 7324 if profile in profiles or profiles == []: 7325 log.info(f"Profile '{profile}'") 7326 7327 sql_set_info_option = "" 7328 7329 sql_set_info = [] 7330 7331 # PZ fields set 7332 7333 # PZScore 7334 if ( 7335 f"{pz_prefix}Score{pzfields_sep}{profile}" 7336 in list_of_pzfields 7337 ): 7338 sql_set_info.append( 7339 f""" 7340 concat( 7341 '{pz_prefix}Score{pzfields_sep}{profile}=', 7342 {pz_prefix}Score{pzfields_sep}{profile} 7343 ) 7344 """ 7345 ) 7346 if ( 7347 profile == default_profile 7348 and f"{pz_prefix}Score" in list_of_pzfields 7349 ): 7350 sql_set_info.append( 7351 f""" 7352 concat( 7353 '{pz_prefix}Score=', 7354 
{pz_prefix}Score{pzfields_sep}{profile} 7355 ) 7356 """ 7357 ) 7358 7359 # PZFlag 7360 if ( 7361 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7362 in list_of_pzfields 7363 ): 7364 sql_set_info.append( 7365 f""" 7366 concat( 7367 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7368 CASE 7369 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7370 THEN 'PASS' 7371 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7372 THEN 'FILTERED' 7373 END 7374 ) 7375 """ 7376 ) 7377 if ( 7378 profile == default_profile 7379 and f"{pz_prefix}Flag" in list_of_pzfields 7380 ): 7381 sql_set_info.append( 7382 f""" 7383 concat( 7384 '{pz_prefix}Flag=', 7385 CASE 7386 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7387 THEN 'PASS' 7388 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7389 THEN 'FILTERED' 7390 END 7391 ) 7392 """ 7393 ) 7394 7395 # PZClass 7396 if ( 7397 f"{pz_prefix}Class{pzfields_sep}{profile}" 7398 in list_of_pzfields 7399 ): 7400 sql_set_info.append( 7401 f""" 7402 concat( 7403 '{pz_prefix}Class{pzfields_sep}{profile}=', 7404 CASE 7405 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7406 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7407 ELSE '.' 7408 END 7409 ) 7410 7411 """ 7412 ) 7413 if ( 7414 profile == default_profile 7415 and f"{pz_prefix}Class" in list_of_pzfields 7416 ): 7417 sql_set_info.append( 7418 f""" 7419 concat( 7420 '{pz_prefix}Class=', 7421 CASE 7422 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7423 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7424 ELSE '.' 
7425 END 7426 ) 7427 """ 7428 ) 7429 7430 # PZComment 7431 if ( 7432 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7433 in list_of_pzfields 7434 ): 7435 sql_set_info.append( 7436 f""" 7437 CASE 7438 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7439 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7440 ELSE '' 7441 END 7442 """ 7443 ) 7444 if ( 7445 profile == default_profile 7446 and f"{pz_prefix}Comment" in list_of_pzfields 7447 ): 7448 sql_set_info.append( 7449 f""" 7450 CASE 7451 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7452 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7453 ELSE '' 7454 END 7455 """ 7456 ) 7457 7458 # PZInfos 7459 if ( 7460 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7461 in list_of_pzfields 7462 ): 7463 sql_set_info.append( 7464 f""" 7465 CASE 7466 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7467 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7468 ELSE '' 7469 END 7470 """ 7471 ) 7472 if ( 7473 profile == default_profile 7474 and f"{pz_prefix}Infos" in list_of_pzfields 7475 ): 7476 sql_set_info.append( 7477 f""" 7478 CASE 7479 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7480 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7481 ELSE '' 7482 END 7483 """ 7484 ) 7485 7486 # Merge PZfields 7487 sql_set_info_option = "" 7488 sql_set_sep = "" 7489 for sql_set in sql_set_info: 7490 if sql_set_sep: 7491 sql_set_info_option += f""" 7492 , concat('{sql_set_sep}', {sql_set}) 7493 """ 7494 else: 7495 sql_set_info_option += f""" 7496 , {sql_set} 7497 """ 7498 sql_set_sep = ";" 7499 7500 sql_queries = [] 7501 for annotation in prioritizations_config[profile]: 7502 7503 # skip special sections 7504 if annotation.startswith("_"): 7505 continue 7506 7507 # For each criterions 7508 for criterion in prioritizations_config[profile][ 7509 annotation 
7510 ]: 7511 7512 # Criterion mode 7513 criterion_mode = None 7514 if np.any( 7515 np.isin(list(criterion.keys()), ["type", "value"]) 7516 ): 7517 criterion_mode = "operation" 7518 elif np.any( 7519 np.isin(list(criterion.keys()), ["sql", "fields"]) 7520 ): 7521 criterion_mode = "sql" 7522 log.debug(f"Criterion Mode: {criterion_mode}") 7523 7524 # Criterion parameters 7525 criterion_type = criterion.get("type", None) 7526 criterion_value = criterion.get("value", None) 7527 criterion_sql = criterion.get("sql", None) 7528 criterion_fields = criterion.get("fields", None) 7529 criterion_score = criterion.get("score", 0) 7530 criterion_flag = criterion.get("flag", "PASS") 7531 criterion_class = criterion.get("class", None) 7532 criterion_flag_bool = criterion_flag == "PASS" 7533 criterion_comment = ( 7534 ", ".join(criterion.get("comment", [])) 7535 .replace("'", "''") 7536 .replace(";", ",") 7537 .replace("\t", " ") 7538 ) 7539 criterion_infos = ( 7540 str(criterion) 7541 .replace("'", "''") 7542 .replace(";", ",") 7543 .replace("\t", " ") 7544 ) 7545 7546 # SQL 7547 if criterion_sql is not None and isinstance( 7548 criterion_sql, list 7549 ): 7550 criterion_sql = " ".join(criterion_sql) 7551 7552 # Fields and explode 7553 if criterion_fields is None: 7554 criterion_fields = [annotation] 7555 if not isinstance(criterion_fields, list): 7556 criterion_fields = str(criterion_fields).split(",") 7557 7558 # Class 7559 if criterion_class is not None and not isinstance( 7560 criterion_class, list 7561 ): 7562 criterion_class = str(criterion_class).split(",") 7563 7564 for annotation_field in criterion_fields: 7565 7566 # Explode specific annotation 7567 log.debug( 7568 f"Explode annotation '{annotation_field}'" 7569 ) 7570 added_columns += self.explode_infos( 7571 prefix=explode_infos_prefix, 7572 fields=[annotation_field], 7573 table=table_variants, 7574 ) 7575 extra_infos = self.get_extra_infos( 7576 table=table_variants 7577 ) 7578 7579 # Check if annotation field is 
present 7580 if ( 7581 f"{explode_infos_prefix}{annotation_field}" 7582 not in extra_infos 7583 ): 7584 msq_err = f"Annotation '{annotation_field}' not in data" 7585 log.error(msq_err) 7586 raise ValueError(msq_err) 7587 else: 7588 log.debug( 7589 f"Annotation '{annotation_field}' in data" 7590 ) 7591 7592 sql_set = [] 7593 sql_set_info = [] 7594 7595 # PZ fields set 7596 7597 # PZScore 7598 if ( 7599 f"{pz_prefix}Score{pzfields_sep}{profile}" 7600 in list_of_pzfields 7601 ): 7602 # VaRank prioritization score mode 7603 if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]: 7604 sql_set.append( 7605 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END " 7606 ) 7607 # default HOWARD prioritization score mode 7608 else: 7609 sql_set.append( 7610 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7611 ) 7612 7613 # PZFlag 7614 if ( 7615 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7616 in list_of_pzfields 7617 ): 7618 sql_set.append( 7619 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7620 ) 7621 7622 # PZClass 7623 if ( 7624 f"{pz_prefix}Class{pzfields_sep}{profile}" 7625 in list_of_pzfields 7626 and criterion_class is not None 7627 ): 7628 sql_set.append( 7629 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7630 ) 7631 7632 # PZComment 7633 if ( 7634 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7635 in list_of_pzfields 7636 ): 7637 sql_set.append( 7638 f""" 7639 {pz_prefix}Comment{pzfields_sep}{profile} = 7640 concat( 7641 {pz_prefix}Comment{pzfields_sep}{profile}, 7642 CASE 7643 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7644 THEN ', ' 7645 ELSE '' 7646 END, 7647 '{criterion_comment}' 7648 ) 7649 """ 
7650 ) 7651 7652 # PZInfos 7653 if ( 7654 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7655 in list_of_pzfields 7656 ): 7657 sql_set.append( 7658 f""" 7659 {pz_prefix}Infos{pzfields_sep}{profile} = 7660 concat( 7661 {pz_prefix}Infos{pzfields_sep}{profile}, 7662 '{criterion_infos}' 7663 ) 7664 """ 7665 ) 7666 sql_set_option = ",".join(sql_set) 7667 7668 # Criterion and comparison 7669 if sql_set_option: 7670 7671 if criterion_mode in ["operation"]: 7672 7673 try: 7674 float(criterion_value) 7675 sql_update = f""" 7676 UPDATE {table_variants} 7677 SET {sql_set_option} 7678 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7679 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7680 """ 7681 except: 7682 contains_option = "" 7683 if criterion_type == "contains": 7684 contains_option = ".*" 7685 sql_update = f""" 7686 UPDATE {table_variants} 7687 SET {sql_set_option} 7688 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7689 """ 7690 sql_queries.append(sql_update) 7691 7692 elif criterion_mode in ["sql"]: 7693 7694 sql_update = f""" 7695 UPDATE {table_variants} 7696 SET {sql_set_option} 7697 WHERE {criterion_sql} 7698 """ 7699 sql_queries.append(sql_update) 7700 7701 else: 7702 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7703 log.error(msg_err) 7704 raise ValueError(msg_err) 7705 7706 else: 7707 log.warning( 7708 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7709 ) 7710 7711 # PZTags 7712 if ( 7713 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7714 in list_of_pzfields 7715 ): 7716 7717 # Create PZFalgs value 7718 pztags_value = "" 7719 pztags_sep_default = "," 7720 pztags_sep = "" 7721 for pzfield in pzfields: 7722 if pzfield not in [f"{pz_prefix}Tags"]: 7723 if ( 7724 f"{pzfield}{pzfields_sep}{profile}" 7725 in list_of_pzfields 7726 ): 7727 if pzfield in [f"{pz_prefix}Flag"]: 7728 
pztags_value += f"""{pztags_sep}{pzfield}#', 7729 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7730 THEN 'PASS' 7731 ELSE 'FILTERED' 7732 END, '""" 7733 elif pzfield in [f"{pz_prefix}Class"]: 7734 pztags_value += f"""{pztags_sep}{pzfield}#', 7735 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7736 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7737 ELSE '.' 7738 END, '""" 7739 else: 7740 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7741 pztags_sep = pztags_sep_default 7742 7743 # Add Query update for PZFlags 7744 sql_update_pztags = f""" 7745 UPDATE {table_variants} 7746 SET INFO = concat( 7747 INFO, 7748 CASE WHEN INFO NOT in ('','.') 7749 THEN ';' 7750 ELSE '' 7751 END, 7752 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7753 ) 7754 """ 7755 sql_queries.append(sql_update_pztags) 7756 7757 # Add Query update for PZFlags for default 7758 if profile == default_profile: 7759 sql_update_pztags_default = f""" 7760 UPDATE {table_variants} 7761 SET INFO = concat( 7762 INFO, 7763 ';', 7764 '{pz_prefix}Tags={pztags_value}' 7765 ) 7766 """ 7767 sql_queries.append(sql_update_pztags_default) 7768 7769 log.info(f"""Profile '{profile}' - Prioritization... """) 7770 7771 if sql_queries: 7772 7773 for sql_query in sql_queries: 7774 log.debug( 7775 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7776 ) 7777 self.conn.execute(sql_query) 7778 7779 log.info(f"""Profile '{profile}' - Update... 
""") 7780 sql_query_update = f""" 7781 UPDATE {table_variants} 7782 SET INFO = 7783 concat( 7784 CASE 7785 WHEN INFO NOT IN ('','.') 7786 THEN concat(INFO, ';') 7787 ELSE '' 7788 END 7789 {sql_set_info_option} 7790 ) 7791 """ 7792 self.conn.execute(sql_query_update) 7793 7794 else: 7795 7796 log.warning(f"No profiles in parameters") 7797 7798 # Remove added columns 7799 for added_column in added_columns: 7800 self.drop_column(column=added_column) 7801 7802 # Explode INFOS fields into table fields 7803 if self.get_explode_infos(): 7804 self.explode_infos( 7805 prefix=self.get_explode_infos_prefix(), 7806 fields=self.get_explode_infos_fields(), 7807 force=True, 7808 ) 7809 7810 return True 7811 7812 ### 7813 # HGVS 7814 ### 7815 7816 def annotation_hgvs(self, threads: int = None) -> None: 7817 """ 7818 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7819 coordinates and alleles. 7820 7821 :param threads: The `threads` parameter is an optional integer that specifies the number of 7822 threads to use for parallel processing. If no value is provided, it will default to the number 7823 of threads obtained from the `get_threads()` method 7824 :type threads: int 7825 """ 7826 7827 # Function for each partition of the Dask Dataframe 7828 def partition_function(partition): 7829 """ 7830 The function `partition_function` applies the `annotation_hgvs_partition` function to 7831 each row of a DataFrame called `partition`. 7832 7833 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7834 to be processed 7835 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7836 the "partition" dataframe along the axis 1. 
7837 """ 7838 return partition.apply(annotation_hgvs_partition, axis=1) 7839 7840 def annotation_hgvs_partition(row) -> str: 7841 """ 7842 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7843 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7844 7845 :param row: A dictionary-like object that contains the values for the following keys: 7846 :return: a string that contains the HGVS names associated with the given row of data. 7847 """ 7848 7849 chr = row["CHROM"] 7850 pos = row["POS"] 7851 ref = row["REF"] 7852 alt = row["ALT"] 7853 7854 # Find list of associated transcripts 7855 transcripts_list = list( 7856 polars_conn.execute( 7857 f""" 7858 SELECT transcript 7859 FROM refseq_df 7860 WHERE CHROM='{chr}' 7861 AND POS={pos} 7862 """ 7863 )["transcript"] 7864 ) 7865 7866 # Full HGVS annotation in list 7867 hgvs_full_list = [] 7868 7869 for transcript_name in transcripts_list: 7870 7871 # Transcript 7872 transcript = get_transcript( 7873 transcripts=transcripts, transcript_name=transcript_name 7874 ) 7875 # Exon 7876 if use_exon: 7877 exon = transcript.find_exon_number(pos) 7878 else: 7879 exon = None 7880 # Protein 7881 transcript_protein = None 7882 if use_protein or add_protein or full_format: 7883 transcripts_protein = list( 7884 polars_conn.execute( 7885 f""" 7886 SELECT protein 7887 FROM refseqlink_df 7888 WHERE transcript='{transcript_name}' 7889 LIMIT 1 7890 """ 7891 )["protein"] 7892 ) 7893 if len(transcripts_protein): 7894 transcript_protein = transcripts_protein[0] 7895 7896 # HGVS name 7897 hgvs_name = format_hgvs_name( 7898 chr, 7899 pos, 7900 ref, 7901 alt, 7902 genome=genome, 7903 transcript=transcript, 7904 transcript_protein=transcript_protein, 7905 exon=exon, 7906 use_gene=use_gene, 7907 use_protein=use_protein, 7908 full_format=full_format, 7909 use_version=use_version, 7910 codon_type=codon_type, 7911 ) 7912 hgvs_full_list.append(hgvs_name) 7913 if add_protein and not 
use_protein and not full_format: 7914 hgvs_name = format_hgvs_name( 7915 chr, 7916 pos, 7917 ref, 7918 alt, 7919 genome=genome, 7920 transcript=transcript, 7921 transcript_protein=transcript_protein, 7922 exon=exon, 7923 use_gene=use_gene, 7924 use_protein=True, 7925 full_format=False, 7926 use_version=use_version, 7927 codon_type=codon_type, 7928 ) 7929 hgvs_full_list.append(hgvs_name) 7930 7931 # Create liste of HGVS annotations 7932 hgvs_full = ",".join(hgvs_full_list) 7933 7934 return hgvs_full 7935 7936 # Polars connexion 7937 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7938 7939 # Config 7940 config = self.get_config() 7941 7942 # Databases 7943 # Genome 7944 databases_genomes_folders = ( 7945 config.get("folders", {}) 7946 .get("databases", {}) 7947 .get("genomes", DEFAULT_GENOME_FOLDER) 7948 ) 7949 databases_genome = ( 7950 config.get("folders", {}).get("databases", {}).get("genomes", "") 7951 ) 7952 # refseq database folder 7953 databases_refseq_folders = ( 7954 config.get("folders", {}) 7955 .get("databases", {}) 7956 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7957 ) 7958 # refseq 7959 databases_refseq = config.get("databases", {}).get("refSeq", None) 7960 # refSeqLink 7961 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7962 7963 # Param 7964 param = self.get_param() 7965 7966 # Quick HGVS 7967 if "hgvs_options" in param and param.get("hgvs_options", ""): 7968 log.info(f"Quick HGVS Annotation:") 7969 if not param.get("hgvs", None): 7970 param["hgvs"] = {} 7971 for option in param.get("hgvs_options", "").split(","): 7972 option_var_val = option.split("=") 7973 option_var = option_var_val[0] 7974 if len(option_var_val) > 1: 7975 option_val = option_var_val[1] 7976 else: 7977 option_val = "True" 7978 if option_val.upper() in ["TRUE"]: 7979 option_val = True 7980 elif option_val.upper() in ["FALSE"]: 7981 option_val = False 7982 log.info(f" {option_var}={option_val}") 7983 param["hgvs"][option_var] = option_val 7984 
7985 # Check if HGVS annotation enabled 7986 if "hgvs" in param: 7987 log.info(f"HGVS Annotation... ") 7988 for hgvs_option in param.get("hgvs", {}): 7989 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7990 else: 7991 return 7992 7993 # HGVS Param 7994 param_hgvs = param.get("hgvs", {}) 7995 use_exon = param_hgvs.get("use_exon", False) 7996 use_gene = param_hgvs.get("use_gene", False) 7997 use_protein = param_hgvs.get("use_protein", False) 7998 add_protein = param_hgvs.get("add_protein", False) 7999 full_format = param_hgvs.get("full_format", False) 8000 use_version = param_hgvs.get("use_version", False) 8001 codon_type = param_hgvs.get("codon_type", "3") 8002 8003 # refSseq refSeqLink 8004 databases_refseq = param_hgvs.get("refseq", databases_refseq) 8005 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 8006 8007 # Assembly 8008 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 8009 8010 # Genome 8011 genome_file = None 8012 if find_genome(databases_genome): 8013 genome_file = find_genome(databases_genome) 8014 else: 8015 genome_file = find_genome( 8016 genome_path=databases_genomes_folders, assembly=assembly 8017 ) 8018 log.debug("Genome: " + str(genome_file)) 8019 8020 # refSseq 8021 refseq_file = find_file_prefix( 8022 input_file=databases_refseq, 8023 prefix="ncbiRefSeq", 8024 folder=databases_refseq_folders, 8025 assembly=assembly, 8026 ) 8027 log.debug("refSeq: " + str(refseq_file)) 8028 8029 # refSeqLink 8030 refseqlink_file = find_file_prefix( 8031 input_file=databases_refseqlink, 8032 prefix="ncbiRefSeqLink", 8033 folder=databases_refseq_folders, 8034 assembly=assembly, 8035 ) 8036 log.debug("refSeqLink: " + str(refseqlink_file)) 8037 8038 # Threads 8039 if not threads: 8040 threads = self.get_threads() 8041 log.debug("Threads: " + str(threads)) 8042 8043 # Variables 8044 table_variants = self.get_table_variants(clause="update") 8045 8046 # Get variants SNV and InDel only 8047 
query_variants = f""" 8048 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8049 FROM {table_variants} 8050 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8051 """ 8052 df_variants = self.get_query_to_df(query_variants) 8053 8054 # Added columns 8055 added_columns = [] 8056 8057 # Add hgvs column in variants table 8058 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8059 added_column = self.add_column( 8060 table_variants, hgvs_column_name, "STRING", default_value=None 8061 ) 8062 added_columns.append(added_column) 8063 8064 log.debug(f"refSeq loading...") 8065 # refSeq in duckDB 8066 refseq_table = get_refseq_table( 8067 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8068 ) 8069 # Loading all refSeq in Dataframe 8070 refseq_query = f""" 8071 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8072 FROM {refseq_table} 8073 JOIN df_variants ON ( 8074 {refseq_table}.chrom = df_variants.CHROM 8075 AND {refseq_table}.txStart<=df_variants.POS 8076 AND {refseq_table}.txEnd>=df_variants.POS 8077 ) 8078 """ 8079 refseq_df = self.conn.query(refseq_query).pl() 8080 8081 if refseqlink_file: 8082 log.debug(f"refSeqLink loading...") 8083 # refSeqLink in duckDB 8084 refseqlink_table = get_refseq_table( 8085 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8086 ) 8087 # Loading all refSeqLink in Dataframe 8088 protacc_column = "protAcc_with_ver" 8089 mrnaacc_column = "mrnaAcc_with_ver" 8090 refseqlink_query = f""" 8091 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8092 FROM {refseqlink_table} 8093 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8094 WHERE protAcc_without_ver IS NOT NULL 8095 """ 8096 # Polars Dataframe 8097 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8098 8099 # Read RefSeq transcripts into a python dict/model. 
8100 log.debug(f"Transcripts loading...") 8101 with tempfile.TemporaryDirectory() as tmpdir: 8102 transcripts_query = f""" 8103 COPY ( 8104 SELECT {refseq_table}.* 8105 FROM {refseq_table} 8106 JOIN df_variants ON ( 8107 {refseq_table}.chrom=df_variants.CHROM 8108 AND {refseq_table}.txStart<=df_variants.POS 8109 AND {refseq_table}.txEnd>=df_variants.POS 8110 ) 8111 ) 8112 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8113 """ 8114 self.conn.query(transcripts_query) 8115 with open(f"{tmpdir}/transcript.tsv") as infile: 8116 transcripts = read_transcripts(infile) 8117 8118 # Polars connexion 8119 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8120 8121 log.debug("Genome loading...") 8122 # Read genome sequence using pyfaidx. 8123 genome = Fasta(genome_file) 8124 8125 log.debug("Start annotation HGVS...") 8126 8127 # Create 8128 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8129 ddf = dd.from_pandas(df_variants, npartitions=threads) 8130 8131 # Use dask.dataframe.apply() to apply function on each partition 8132 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8133 8134 # Convert Dask DataFrame to Pandas Dataframe 8135 df = ddf.compute() 8136 8137 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
8138 with tempfile.TemporaryDirectory() as tmpdir: 8139 df_parquet = os.path.join(tmpdir, "df.parquet") 8140 df.to_parquet(df_parquet) 8141 8142 # Update hgvs column 8143 update_variant_query = f""" 8144 UPDATE {table_variants} 8145 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8146 FROM read_parquet('{df_parquet}') as df 8147 WHERE variants."#CHROM" = df.CHROM 8148 AND variants.POS = df.POS 8149 AND variants.REF = df.REF 8150 AND variants.ALT = df.ALT 8151 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8152 """ 8153 self.execute_query(update_variant_query) 8154 8155 # Update INFO column 8156 sql_query_update = f""" 8157 UPDATE {table_variants} 8158 SET INFO = 8159 concat( 8160 CASE 8161 WHEN INFO NOT IN ('','.') 8162 THEN concat(INFO, ';') 8163 ELSE '' 8164 END, 8165 'hgvs=', 8166 {hgvs_column_name} 8167 ) 8168 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8169 """ 8170 self.execute_query(sql_query_update) 8171 8172 # Add header 8173 HGVS_INFOS = { 8174 "hgvs": { 8175 "ID": "hgvs", 8176 "Number": ".", 8177 "Type": "String", 8178 "Description": f"HGVS annotatation with HOWARD", 8179 } 8180 } 8181 8182 for field in HGVS_INFOS: 8183 field_ID = HGVS_INFOS[field]["ID"] 8184 field_description = HGVS_INFOS[field]["Description"] 8185 self.get_header().infos[field_ID] = vcf.parser._Info( 8186 field_ID, 8187 HGVS_INFOS[field]["Number"], 8188 HGVS_INFOS[field]["Type"], 8189 field_description, 8190 "unknown", 8191 "unknown", 8192 code_type_map[HGVS_INFOS[field]["Type"]], 8193 ) 8194 8195 # Remove added columns 8196 for added_column in added_columns: 8197 self.drop_column(column=added_column) 8198 8199 ### 8200 # Calculation 8201 ### 8202 8203 def get_operations_help( 8204 self, operations_config_dict: dict = {}, operations_config_file: str = None 8205 ) -> list: 8206 8207 # Init 8208 operations_help = [] 8209 8210 # operations 8211 operations = self.get_config_json( 8212 name="calculations", 8213 
config_dict=operations_config_dict, 8214 config_file=operations_config_file, 8215 ) 8216 for op in operations: 8217 op_name = operations[op].get("name", op).upper() 8218 op_description = operations[op].get("description", op_name) 8219 op_available = operations[op].get("available", False) 8220 if op_available: 8221 operations_help.append(f" {op_name}: {op_description}") 8222 8223 # Sort operations 8224 operations_help.sort() 8225 8226 # insert header 8227 operations_help.insert(0, "Available calculation operations:") 8228 8229 # Return 8230 return operations_help 8231 8232 def calculation( 8233 self, 8234 operations: dict = {}, 8235 operations_config_dict: dict = {}, 8236 operations_config_file: str = None, 8237 ) -> None: 8238 """ 8239 It takes a list of operations, and for each operation, it checks if it's a python or sql 8240 operation, and then calls the appropriate function 8241 8242 param json example: 8243 "calculation": { 8244 "NOMEN": { 8245 "options": { 8246 "hgvs_field": "hgvs" 8247 }, 8248 "middle" : null 8249 } 8250 """ 8251 8252 # Param 8253 param = self.get_param() 8254 8255 # CHeck operations config file 8256 if operations_config_file is None: 8257 operations_config_file = param.get("calculation", {}).get( 8258 "calculation_config", None 8259 ) 8260 8261 # operations config 8262 operations_config = self.get_config_json( 8263 name="calculations", 8264 config_dict=operations_config_dict, 8265 config_file=operations_config_file, 8266 ) 8267 8268 # Upper keys 8269 operations_config = {k.upper(): v for k, v in operations_config.items()} 8270 8271 # Calculations 8272 8273 # Operations from param 8274 operations = param.get("calculation", {}).get("calculations", operations) 8275 8276 # Quick calculation - add 8277 if param.get("calculations", None): 8278 8279 # List of operations 8280 calculations_list = [ 8281 value.strip() for value in param.get("calculations", "").split(",") 8282 ] 8283 8284 # Log 8285 log.info(f"Quick Calculations:") 8286 for 
calculation_key in calculations_list: 8287 log.info(f" {calculation_key}") 8288 8289 # Create tmp operations (to keep operation order) 8290 operations_tmp = {} 8291 for calculation_operation in calculations_list: 8292 if calculation_operation.upper() not in operations_tmp: 8293 log.debug( 8294 f"{calculation_operation}.upper() not in {operations_tmp}" 8295 ) 8296 operations_tmp[calculation_operation.upper()] = {} 8297 add_value_into_dict( 8298 dict_tree=operations_tmp, 8299 sections=[ 8300 calculation_operation.upper(), 8301 ], 8302 value=operations.get(calculation_operation.upper(), {}), 8303 ) 8304 # Add operations already in param 8305 for calculation_operation in operations: 8306 if calculation_operation not in operations_tmp: 8307 operations_tmp[calculation_operation] = operations.get( 8308 calculation_operation, {} 8309 ) 8310 8311 # Update operations in param 8312 operations = operations_tmp 8313 8314 # Operations for calculation 8315 if not operations: 8316 operations = param.get("calculation", {}).get("calculations", {}) 8317 8318 if operations: 8319 log.info(f"Calculations...") 8320 8321 # For each operations 8322 for operation_name in operations: 8323 operation_name = operation_name.upper() 8324 if operation_name not in [""]: 8325 if operation_name in operations_config: 8326 log.info(f"Calculation '{operation_name}'") 8327 operation = operations_config[operation_name] 8328 operation_type = operation.get("type", "sql") 8329 if operation_type == "python": 8330 self.calculation_process_function( 8331 operation=operation, operation_name=operation_name 8332 ) 8333 elif operation_type == "sql": 8334 self.calculation_process_sql( 8335 operation=operation, operation_name=operation_name 8336 ) 8337 else: 8338 log.error( 8339 f"Operations config: Type '{operation_type}' NOT available" 8340 ) 8341 raise ValueError( 8342 f"Operations config: Type '{operation_type}' NOT available" 8343 ) 8344 else: 8345 log.error( 8346 f"Operations config: Calculation 
'{operation_name}' NOT available" 8347 ) 8348 raise ValueError( 8349 f"Operations config: Calculation '{operation_name}' NOT available" 8350 ) 8351 8352 # Explode INFOS fields into table fields 8353 if self.get_explode_infos(): 8354 self.explode_infos( 8355 prefix=self.get_explode_infos_prefix(), 8356 fields=self.get_explode_infos_fields(), 8357 force=True, 8358 ) 8359 8360 def calculation_process_sql( 8361 self, operation: dict, operation_name: str = "unknown" 8362 ) -> None: 8363 """ 8364 The `calculation_process_sql` function takes in a mathematical operation as a string and 8365 performs the operation, updating the specified table with the result. 8366 8367 :param operation: The `operation` parameter is a dictionary that contains information about the 8368 mathematical operation to be performed. It includes the following keys: 8369 :type operation: dict 8370 :param operation_name: The `operation_name` parameter is a string that represents the name of 8371 the mathematical operation being performed. 
It is used for logging and error handling purposes, 8372 defaults to unknown 8373 :type operation_name: str (optional) 8374 """ 8375 8376 # Operation infos 8377 operation_name = operation.get("name", "unknown") 8378 log.debug(f"process SQL {operation_name}") 8379 output_column_name = operation.get("output_column_name", operation_name) 8380 output_column_type = operation.get("output_column_type", "String") 8381 prefix = operation.get("explode_infos_prefix", "") 8382 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8383 output_column_description = operation.get( 8384 "output_column_description", f"{operation_name} operation" 8385 ) 8386 operation_query = operation.get("operation_query", None) 8387 if isinstance(operation_query, list): 8388 operation_query = " ".join(operation_query) 8389 operation_info_fields = operation.get("info_fields", []) 8390 operation_info_fields_check = operation.get("info_fields_check", False) 8391 operation_info = operation.get("operation_info", True) 8392 operation_table = operation.get( 8393 "table", self.get_table_variants(clause="alter") 8394 ) 8395 8396 # table variants 8397 if operation_table: 8398 table_variants = operation_table 8399 else: 8400 table_variants = self.get_table_variants(clause="alter") 8401 8402 if operation_query: 8403 8404 # Info fields check 8405 operation_info_fields_check_result = True 8406 if operation_info_fields_check: 8407 header_infos = self.get_header().infos 8408 for info_field in operation_info_fields: 8409 operation_info_fields_check_result = ( 8410 operation_info_fields_check_result 8411 and info_field in header_infos 8412 ) 8413 8414 # If info fields available 8415 if operation_info_fields_check_result: 8416 8417 # Added_columns 8418 added_columns = [] 8419 8420 # Create VCF header field 8421 vcf_reader = self.get_header() 8422 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8423 output_column_name, 8424 ".", 8425 output_column_type, 8426 
output_column_description, 8427 "howard calculation", 8428 "0", 8429 self.code_type_map.get(output_column_type), 8430 ) 8431 8432 # Explode infos if needed 8433 log.debug(f"calculation_process_sql prefix {prefix}") 8434 added_columns += self.explode_infos( 8435 prefix=prefix, 8436 fields=[output_column_name] + operation_info_fields, 8437 force=False, 8438 table=table_variants, 8439 ) 8440 8441 # Create column 8442 added_column = self.add_column( 8443 table_name=table_variants, 8444 column_name=prefix + output_column_name, 8445 column_type=output_column_type_sql, 8446 default_value="null", 8447 ) 8448 added_columns.append(added_column) 8449 8450 # Operation calculation 8451 try: 8452 8453 # Query to update calculation column 8454 sql_update = f""" 8455 UPDATE {table_variants} 8456 SET "{prefix}{output_column_name}" = ({operation_query}) 8457 """ 8458 self.conn.execute(sql_update) 8459 8460 # Add to INFO 8461 if operation_info: 8462 sql_update_info = f""" 8463 UPDATE {table_variants} 8464 SET "INFO" = 8465 concat( 8466 CASE 8467 WHEN "INFO" IS NOT NULL 8468 THEN concat("INFO", ';') 8469 ELSE '' 8470 END, 8471 '{output_column_name}=', 8472 "{prefix}{output_column_name}" 8473 ) 8474 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8475 """ 8476 self.conn.execute(sql_update_info) 8477 8478 except: 8479 log.error( 8480 f"Operations config: Calculation '{operation_name}' query failed" 8481 ) 8482 raise ValueError( 8483 f"Operations config: Calculation '{operation_name}' query failed" 8484 ) 8485 8486 # Remove added columns 8487 for added_column in added_columns: 8488 log.debug(f"added_column: {added_column}") 8489 self.drop_column(column=added_column) 8490 8491 else: 8492 log.error( 8493 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8494 ) 8495 raise ValueError( 8496 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8497 ) 8498 8499 else: 8500 log.error( 8501 f"Operations config: Calculation '{operation_name}' query NOT defined" 8502 ) 8503 raise ValueError( 8504 f"Operations config: Calculation '{operation_name}' query NOT defined" 8505 ) 8506 8507 def calculation_process_function( 8508 self, operation: dict, operation_name: str = "unknown" 8509 ) -> None: 8510 """ 8511 The `calculation_process_function` takes in an operation dictionary and performs the specified 8512 function with the given parameters. 8513 8514 :param operation: The `operation` parameter is a dictionary that contains information about the 8515 operation to be performed. It has the following keys: 8516 :type operation: dict 8517 :param operation_name: The `operation_name` parameter is a string that represents the name of 8518 the operation being performed. It is used for logging purposes, defaults to unknown 8519 :type operation_name: str (optional) 8520 """ 8521 8522 operation_name = operation["name"] 8523 log.debug(f"process Python {operation_name}") 8524 function_name = operation["function_name"] 8525 function_params = operation["function_params"] 8526 getattr(self, function_name)(*function_params) 8527 8528 def calculation_variant_id(self) -> None: 8529 """ 8530 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8531 updates the INFO field of a variants table with the variant ID. 
8532 """ 8533 8534 # variant_id annotation field 8535 variant_id_tag = self.get_variant_id_column() 8536 added_columns = [variant_id_tag] 8537 8538 # variant_id hgvs tags" 8539 vcf_infos_tags = { 8540 variant_id_tag: "howard variant ID annotation", 8541 } 8542 8543 # Variants table 8544 table_variants = self.get_table_variants() 8545 8546 # Header 8547 vcf_reader = self.get_header() 8548 8549 # Add variant_id to header 8550 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8551 variant_id_tag, 8552 ".", 8553 "String", 8554 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8555 "howard calculation", 8556 "0", 8557 self.code_type_map.get("String"), 8558 ) 8559 8560 # Update 8561 sql_update = f""" 8562 UPDATE {table_variants} 8563 SET "INFO" = 8564 concat( 8565 CASE 8566 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8567 THEN '' 8568 ELSE concat("INFO", ';') 8569 END, 8570 '{variant_id_tag}=', 8571 "{variant_id_tag}" 8572 ) 8573 """ 8574 self.conn.execute(sql_update) 8575 8576 # Remove added columns 8577 for added_column in added_columns: 8578 self.drop_column(column=added_column) 8579 8580 def calculation_extract_snpeff_hgvs( 8581 self, 8582 snpeff_hgvs: str = "snpeff_hgvs", 8583 snpeff_field: str = "ANN", 8584 ) -> None: 8585 """ 8586 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8587 annotation field in a VCF file and adds them as a new column in the variants table. 8588 8589 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8590 function is used to specify the name of the column that will store the HGVS nomenclatures 8591 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8592 snpeff_hgvs 8593 :type snpeff_hgvs: str (optional) 8594 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8595 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 8596 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8597 to ANN 8598 :type snpeff_field: str (optional) 8599 """ 8600 8601 # Snpeff hgvs tags 8602 vcf_infos_tags = { 8603 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8604 } 8605 8606 # Prefix 8607 prefix = self.get_explode_infos_prefix() 8608 if prefix: 8609 prefix = "INFO/" 8610 8611 # snpEff fields 8612 speff_ann_infos = prefix + snpeff_field 8613 speff_hgvs_infos = prefix + snpeff_hgvs 8614 8615 # Variants table 8616 table_variants = self.get_table_variants() 8617 8618 # Header 8619 vcf_reader = self.get_header() 8620 8621 # Add columns 8622 added_columns = [] 8623 8624 # Explode HGVS field in column 8625 added_columns += self.explode_infos(fields=[snpeff_field]) 8626 8627 if snpeff_field in vcf_reader.infos: 8628 8629 log.debug(vcf_reader.infos[snpeff_field]) 8630 8631 # Extract ANN header 8632 ann_description = vcf_reader.infos[snpeff_field].desc 8633 pattern = r"'(.+?)'" 8634 match = re.search(pattern, ann_description) 8635 if match: 8636 ann_header_match = match.group(1).split(" | ") 8637 ann_header_desc = {} 8638 for i in range(len(ann_header_match)): 8639 ann_header_info = "".join( 8640 char for char in ann_header_match[i] if char.isalnum() 8641 ) 8642 ann_header_desc[ann_header_info] = ann_header_match[i] 8643 if not ann_header_desc: 8644 raise ValueError("Invalid header description format") 8645 else: 8646 raise ValueError("Invalid header description format") 8647 8648 # Create variant id 8649 variant_id_column = self.get_variant_id_column() 8650 added_columns += [variant_id_column] 8651 8652 # Create dataframe 8653 dataframe_snpeff_hgvs = self.get_query_to_df( 8654 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8655 ) 8656 8657 # Create main NOMEN column 8658 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8659 speff_ann_infos 8660 ].apply( 8661 lambda x: extract_snpeff_hgvs( 
8662 str(x), header=list(ann_header_desc.values()) 8663 ) 8664 ) 8665 8666 # Add snpeff_hgvs to header 8667 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8668 snpeff_hgvs, 8669 ".", 8670 "String", 8671 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8672 "howard calculation", 8673 "0", 8674 self.code_type_map.get("String"), 8675 ) 8676 8677 # Update 8678 sql_update = f""" 8679 UPDATE variants 8680 SET "INFO" = 8681 concat( 8682 CASE 8683 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8684 THEN '' 8685 ELSE concat("INFO", ';') 8686 END, 8687 CASE 8688 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8689 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8690 THEN concat( 8691 '{snpeff_hgvs}=', 8692 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8693 ) 8694 ELSE '' 8695 END 8696 ) 8697 FROM dataframe_snpeff_hgvs 8698 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8699 8700 """ 8701 self.conn.execute(sql_update) 8702 8703 # Delete dataframe 8704 del dataframe_snpeff_hgvs 8705 gc.collect() 8706 8707 else: 8708 8709 log.warning( 8710 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 8711 ) 8712 8713 # Remove added columns 8714 for added_column in added_columns: 8715 self.drop_column(column=added_column) 8716 8717 def calculation_snpeff_ann_explode( 8718 self, 8719 uniquify: bool = True, 8720 output_format: str = "fields", 8721 output_prefix: str = "snpeff_", 8722 snpeff_field: str = "ANN", 8723 ) -> None: 8724 """ 8725 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8726 exploding the HGVS field and updating variant information accordingly. 8727 8728 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8729 boolean flag that determines whether the output should be uniquified or not. 
When set to `True`, 8730 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8731 defaults to True 8732 :type uniquify: bool (optional) 8733 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8734 function specifies the format in which the output annotations will be generated. It has a 8735 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8736 format, defaults to fields 8737 :type output_format: str (optional) 8738 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8739 method is used to specify the prefix that will be added to the output annotations generated 8740 during the calculation process. This prefix helps to differentiate the newly added annotations 8741 from existing ones in the output data. By default, the, defaults to ANN_ 8742 :type output_prefix: str (optional) 8743 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8744 function is used to specify the field in the VCF file that contains SnpEff annotations. 
This 8745 field will be processed to explode the HGVS annotations and update the variant information 8746 accordingly, defaults to ANN 8747 :type snpeff_field: str (optional) 8748 """ 8749 8750 # SnpEff annotation field 8751 snpeff_hgvs = "snpeff_ann_explode" 8752 8753 # Snpeff hgvs tags 8754 vcf_infos_tags = { 8755 snpeff_hgvs: "Explode snpEff annotations", 8756 } 8757 8758 # Prefix 8759 prefix = self.get_explode_infos_prefix() 8760 if prefix: 8761 prefix = "INFO/" 8762 8763 # snpEff fields 8764 speff_ann_infos = prefix + snpeff_field 8765 speff_hgvs_infos = prefix + snpeff_hgvs 8766 8767 # Variants table 8768 table_variants = self.get_table_variants() 8769 8770 # Header 8771 vcf_reader = self.get_header() 8772 8773 # Add columns 8774 added_columns = [] 8775 8776 # Explode HGVS field in column 8777 added_columns += self.explode_infos(fields=[snpeff_field]) 8778 log.debug(f"snpeff_field={snpeff_field}") 8779 log.debug(f"added_columns={added_columns}") 8780 8781 if snpeff_field in vcf_reader.infos: 8782 8783 # Extract ANN header 8784 ann_description = vcf_reader.infos[snpeff_field].desc 8785 pattern = r"'(.+?)'" 8786 match = re.search(pattern, ann_description) 8787 if match: 8788 ann_header_match = match.group(1).split(" | ") 8789 ann_header = [] 8790 ann_header_desc = {} 8791 for i in range(len(ann_header_match)): 8792 ann_header_info = "".join( 8793 char for char in ann_header_match[i] if char.isalnum() 8794 ) 8795 ann_header.append(ann_header_info) 8796 ann_header_desc[ann_header_info] = ann_header_match[i] 8797 if not ann_header_desc: 8798 raise ValueError("Invalid header description format") 8799 else: 8800 raise ValueError("Invalid header description format") 8801 8802 # Create variant id 8803 variant_id_column = self.get_variant_id_column() 8804 added_columns += [variant_id_column] 8805 8806 # Create dataframe 8807 dataframe_snpeff_hgvs = self.get_query_to_df( 8808 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8809 ) 8810 
8811 # Create snpEff columns 8812 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8813 speff_ann_infos 8814 ].apply( 8815 lambda x: explode_snpeff_ann( 8816 str(x), 8817 uniquify=uniquify, 8818 output_format=output_format, 8819 prefix=output_prefix, 8820 header=list(ann_header_desc.values()), 8821 ) 8822 ) 8823 8824 # Header 8825 ann_annotations_prefix = "" 8826 if output_format.upper() in ["JSON"]: 8827 ann_annotations_prefix = f"{output_prefix}=" 8828 vcf_reader.infos[output_prefix] = vcf.parser._Info( 8829 output_prefix, 8830 ".", 8831 "String", 8832 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8833 + " - JSON format", 8834 "howard calculation", 8835 "0", 8836 self.code_type_map.get("String"), 8837 ) 8838 else: 8839 for ann_annotation in ann_header: 8840 ann_annotation_id = f"{output_prefix}{ann_annotation}" 8841 vcf_reader.infos[ann_annotation_id] = vcf.parser._Info( 8842 ann_annotation_id, 8843 ".", 8844 "String", 8845 vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations") 8846 + f" - '{ann_header_desc[ann_annotation]}' annotation", 8847 "howard calculation", 8848 "0", 8849 self.code_type_map.get("String"), 8850 ) 8851 8852 # Update 8853 sql_update = f""" 8854 UPDATE variants 8855 SET "INFO" = 8856 concat( 8857 CASE 8858 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8859 THEN '' 8860 ELSE concat("INFO", ';') 8861 END, 8862 CASE 8863 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8864 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8865 THEN concat( 8866 '{ann_annotations_prefix}', 8867 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8868 ) 8869 ELSE '' 8870 END 8871 ) 8872 FROM dataframe_snpeff_hgvs 8873 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8874 8875 """ 8876 self.conn.execute(sql_update) 8877 8878 # Delete dataframe 8879 del dataframe_snpeff_hgvs 8880 gc.collect() 8881 8882 else: 8883 8884 log.warning( 8885 "No snpEff annotation. 
Please Anotate with snpEff before use this calculation option" 8886 ) 8887 8888 # Remove added columns 8889 for added_column in added_columns: 8890 self.drop_column(column=added_column) 8891 8892 def calculation_extract_nomen(self) -> None: 8893 """ 8894 This function extracts the HGVS nomenclature from the calculation/identification of NOMEN. 8895 """ 8896 8897 # NOMEN field 8898 field_nomen_dict = "NOMEN_DICT" 8899 8900 # NOMEN structure 8901 nomen_dict = { 8902 "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)", 8903 "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)", 8904 "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)", 8905 "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant", 8906 "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)", 8907 "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)", 8908 "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)", 8909 "VNOMEN": "VNOMEN hgvs transcript version used (e.g. 
for CNOMEN and PNOMEN)", 8910 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8911 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8912 } 8913 8914 # Param 8915 param = self.get_param() 8916 8917 # Threads 8918 threads = self.get_threads() 8919 8920 # Prefix 8921 prefix = self.get_explode_infos_prefix() 8922 8923 # Header 8924 vcf_reader = self.get_header() 8925 8926 # Added columns 8927 added_columns = [] 8928 8929 # Get HGVS field 8930 hgvs_field = ( 8931 param.get("calculation", {}) 8932 .get("calculations", {}) 8933 .get("NOMEN", {}) 8934 .get("options", {}) 8935 .get("hgvs_field", "hgvs") 8936 ) 8937 8938 # Get NOMEN pattern 8939 nomen_pattern = ( 8940 param.get("calculation", {}) 8941 .get("calculations", {}) 8942 .get("NOMEN", {}) 8943 .get("options", {}) 8944 .get("pattern", None) 8945 ) 8946 8947 # transcripts list of preference sources 8948 transcripts_sources = {} 8949 8950 # Get transcripts 8951 transcripts_file = ( 8952 param.get("calculation", {}) 8953 .get("calculations", {}) 8954 .get("NOMEN", {}) 8955 .get("options", {}) 8956 .get("transcripts", None) 8957 ) 8958 transcripts_file = full_path(transcripts_file) 8959 if transcripts_file: 8960 if os.path.exists(transcripts_file): 8961 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8962 transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist() 8963 transcripts_sources["file"] = transcripts_from_file 8964 else: 8965 msg_err = f"Transcript file '{transcripts_file}' does NOT exist" 8966 log.error(msg_err) 8967 raise ValueError(msg_err) 8968 8969 # Get transcripts table 8970 transcripts_table = ( 8971 param.get("calculation", {}) 8972 .get("calculations", {}) 8973 .get("NOMEN", {}) 8974 .get("options", {}) 8975 .get("transcripts_table", self.get_table_variants()) 8976 ) 8977 # Get transcripts column 8978 transcripts_column = ( 8979 param.get("calculation", {}) 8980 .get("calculations", {}) 8981 .get("NOMEN", {}) 8982 
.get("options", {}) 8983 .get("transcripts_column", None) 8984 ) 8985 8986 if transcripts_table and transcripts_column: 8987 extra_field_transcript = f"{transcripts_table}.{transcripts_column}" 8988 # Explode if not exists 8989 added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table) 8990 else: 8991 extra_field_transcript = f"NULL" 8992 8993 # Transcripts of preference source order 8994 transcripts_order = ( 8995 param.get("calculation", {}) 8996 .get("calculations", {}) 8997 .get("NOMEN", {}) 8998 .get("options", {}) 8999 .get("transcripts_order", ["column", "file"]) 9000 ) 9001 9002 # Transcripts from file 9003 transcripts = transcripts_sources.get("file", []) 9004 9005 # Explode HGVS field in column 9006 added_columns += self.explode_infos(fields=[hgvs_field]) 9007 9008 # extra infos 9009 extra_infos = self.get_extra_infos() 9010 extra_field = prefix + hgvs_field 9011 9012 if extra_field in extra_infos: 9013 9014 # Create dataframe 9015 dataframe_hgvs = self.get_query_to_df( 9016 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """ 9017 ) 9018 9019 # Transcripts rank 9020 transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)} 9021 transcripts_len = len(transcripts_rank) 9022 9023 # Create main NOMEN column 9024 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply( 9025 lambda x: find_nomen( 9026 hgvs=x.hgvs, 9027 transcript=x.transcript, 9028 transcripts=transcripts_rank, 9029 pattern=nomen_pattern, 9030 transcripts_source_order=transcripts_order, 9031 transcripts_len=transcripts_len 9032 ), 9033 axis=1, 9034 ) 9035 9036 # Explode NOMEN Structure and create SQL set for update 9037 sql_nomen_fields = [] 9038 for nomen_field in nomen_dict: 9039 9040 # Create VCF header field 9041 vcf_reader.infos[nomen_field] = vcf.parser._Info( 9042 nomen_field, 9043 ".", 9044 "String", 9045 nomen_dict.get(nomen_field, "howard 
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Compute, for each variant, the number of pipelines/samples in which the
        variant is found, and append the result to the INFO column as
        '<tag>=<value>'.

        The method is a no-op when the VCF has no FORMAT column or no sample.
        The per-variant value is computed by the module-level `findbypipeline`
        helper (imported via the wildcard imports at the top of the file)
        applied row by row to the genotype columns.

        :param tag: annotation field name used both in the VCF header and in
            the INFO column, defaults to "findbypipeline"
        :type tag: str (optional)
        """

        # Nothing to compute without genotype (FORMAT/sample) information
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF header description for the annotation
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the computed value
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column, used as the join key for the UPDATE below;
            # it is added to the table and dropped again at the end
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and each (quoted) sample column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value row by row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO, joining on the variant id.
            # The FROM clause names the local variable
            # `dataframe_findbypipeline` directly (DuckDB replacement scan) —
            # do not rename that local.
            # NOTE(review): the table name 'variants' is hard-coded here while
            # sibling methods use {table_variants} — confirm they are always
            # the same table.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_findbypipeline
            gc.collect()
('','.') 9176 THEN '' 9177 ELSE concat("INFO", ';') 9178 END, 9179 CASE 9180 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9181 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9182 THEN concat( 9183 '{findbypipeline_tag}=', 9184 dataframe_findbypipeline."{findbypipeline_infos}" 9185 ) 9186 ELSE '' 9187 END 9188 ) 9189 FROM dataframe_findbypipeline 9190 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9191 """ 9192 self.conn.execute(sql_update) 9193 9194 # Remove added columns 9195 for added_column in added_columns: 9196 self.drop_column(column=added_column) 9197 9198 # Delete dataframe 9199 del dataframe_findbypipeline 9200 gc.collect() 9201 9202 def calculation_genotype_concordance(self) -> None: 9203 """ 9204 The function `calculation_genotype_concordance` calculates the genotype concordance for 9205 multi-caller VCF files and updates the variant information in the database. 9206 """ 9207 9208 # if FORMAT and samples 9209 if ( 9210 "FORMAT" in self.get_header_columns_as_list() 9211 and self.get_header_sample_list() 9212 ): 9213 9214 # genotypeconcordance annotation field 9215 genotypeconcordance_tag = "genotypeconcordance" 9216 9217 # VCF infos tags 9218 vcf_infos_tags = { 9219 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9220 } 9221 9222 # Prefix 9223 prefix = self.get_explode_infos_prefix() 9224 9225 # Field 9226 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9227 9228 # Variants table 9229 table_variants = self.get_table_variants() 9230 9231 # Header 9232 vcf_reader = self.get_header() 9233 9234 # Create variant id 9235 variant_id_column = self.get_variant_id_column() 9236 added_columns = [variant_id_column] 9237 9238 # variant_id, FORMAT and samples 9239 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9240 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9241 ) 9242 9243 # Create dataframe 9244 
dataframe_genotypeconcordance = self.get_query_to_df( 9245 f""" SELECT {samples_fields} FROM {table_variants} """ 9246 ) 9247 9248 # Create genotypeconcordance column 9249 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9250 dataframe_genotypeconcordance.apply( 9251 lambda row: genotypeconcordance( 9252 row, samples=self.get_header_sample_list() 9253 ), 9254 axis=1, 9255 ) 9256 ) 9257 9258 # Add genotypeconcordance to header 9259 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9260 genotypeconcordance_tag, 9261 ".", 9262 "String", 9263 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9264 "howard calculation", 9265 "0", 9266 self.code_type_map.get("String"), 9267 ) 9268 9269 # Update 9270 sql_update = f""" 9271 UPDATE variants 9272 SET "INFO" = 9273 concat( 9274 CASE 9275 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9276 THEN '' 9277 ELSE concat("INFO", ';') 9278 END, 9279 CASE 9280 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9281 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9282 THEN concat( 9283 '{genotypeconcordance_tag}=', 9284 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9285 ) 9286 ELSE '' 9287 END 9288 ) 9289 FROM dataframe_genotypeconcordance 9290 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9291 """ 9292 self.conn.execute(sql_update) 9293 9294 # Remove added columns 9295 for added_column in added_columns: 9296 self.drop_column(column=added_column) 9297 9298 # Delete dataframe 9299 del dataframe_genotypeconcordance 9300 gc.collect() 9301 9302 def calculation_barcode(self, tag: str = "barcode") -> None: 9303 """ 9304 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9305 updates the INFO field in the file with the calculated barcode values. 
9306 9307 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9308 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 9309 the default tag name is set to "barcode", defaults to barcode 9310 :type tag: str (optional) 9311 """ 9312 9313 # if FORMAT and samples 9314 if ( 9315 "FORMAT" in self.get_header_columns_as_list() 9316 and self.get_header_sample_list() 9317 ): 9318 9319 # barcode annotation field 9320 if not tag: 9321 tag = "barcode" 9322 9323 # VCF infos tags 9324 vcf_infos_tags = { 9325 tag: "barcode calculation (VaRank)", 9326 } 9327 9328 # Prefix 9329 prefix = self.get_explode_infos_prefix() 9330 9331 # Field 9332 barcode_infos = prefix + tag 9333 9334 # Variants table 9335 table_variants = self.get_table_variants() 9336 9337 # Header 9338 vcf_reader = self.get_header() 9339 9340 # Create variant id 9341 variant_id_column = self.get_variant_id_column() 9342 added_columns = [variant_id_column] 9343 9344 # variant_id, FORMAT and samples 9345 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9346 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9347 ) 9348 9349 # Create dataframe 9350 dataframe_barcode = self.get_query_to_df( 9351 f""" SELECT {samples_fields} FROM {table_variants} """ 9352 ) 9353 9354 # Create barcode column 9355 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9356 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9357 ) 9358 9359 # Add barcode to header 9360 vcf_reader.infos[tag] = vcf.parser._Info( 9361 tag, 9362 ".", 9363 "String", 9364 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9365 "howard calculation", 9366 "0", 9367 self.code_type_map.get("String"), 9368 ) 9369 9370 # Update 9371 sql_update = f""" 9372 UPDATE {table_variants} 9373 SET "INFO" = 9374 concat( 9375 CASE 9376 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9377 THEN '' 9378 ELSE concat("INFO", ';') 9379 END, 9380 CASE 
9381 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9382 AND dataframe_barcode."{barcode_infos}" NOT NULL 9383 THEN concat( 9384 '{tag}=', 9385 dataframe_barcode."{barcode_infos}" 9386 ) 9387 ELSE '' 9388 END 9389 ) 9390 FROM dataframe_barcode 9391 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9392 """ 9393 self.conn.execute(sql_update) 9394 9395 # Remove added columns 9396 for added_column in added_columns: 9397 self.drop_column(column=added_column) 9398 9399 # Delete dataframe 9400 del dataframe_barcode 9401 gc.collect() 9402 9403 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9404 """ 9405 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9406 and updates the INFO field in the file with the calculated barcode values. 9407 9408 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9409 the barcode tag that will be added to the VCF file during the calculation process. 
If no value 9410 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9411 :type tag: str (optional) 9412 """ 9413 9414 # if FORMAT and samples 9415 if ( 9416 "FORMAT" in self.get_header_columns_as_list() 9417 and self.get_header_sample_list() 9418 ): 9419 9420 # barcode annotation field 9421 if not tag: 9422 tag = "BCF" 9423 9424 # VCF infos tags 9425 vcf_infos_tags = { 9426 tag: "barcode family calculation", 9427 f"{tag}S": "barcode family samples", 9428 } 9429 9430 # Param 9431 param = self.get_param() 9432 log.debug(f"param={param}") 9433 9434 # Prefix 9435 prefix = self.get_explode_infos_prefix() 9436 9437 # PED param 9438 ped = ( 9439 param.get("calculation", {}) 9440 .get("calculations", {}) 9441 .get("BARCODEFAMILY", {}) 9442 .get("family_pedigree", None) 9443 ) 9444 log.debug(f"ped={ped}") 9445 9446 # Load PED 9447 if ped: 9448 9449 # Pedigree is a file 9450 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9451 log.debug("Pedigree is file") 9452 with open(full_path(ped)) as ped: 9453 ped = yaml.safe_load(ped) 9454 9455 # Pedigree is a string 9456 elif isinstance(ped, str): 9457 log.debug("Pedigree is str") 9458 try: 9459 ped = json.loads(ped) 9460 log.debug("Pedigree is json str") 9461 except ValueError as e: 9462 ped_samples = ped.split(",") 9463 ped = {} 9464 for ped_sample in ped_samples: 9465 ped[ped_sample] = ped_sample 9466 9467 # Pedigree is a dict 9468 elif isinstance(ped, dict): 9469 log.debug("Pedigree is dict") 9470 9471 # Pedigree is not well formatted 9472 else: 9473 msg_error = "Pedigree not well formatted" 9474 log.error(msg_error) 9475 raise ValueError(msg_error) 9476 9477 # Construct list 9478 ped_samples = list(ped.values()) 9479 9480 else: 9481 log.debug("Pedigree not defined. 
Take all samples") 9482 ped_samples = self.get_header_sample_list() 9483 ped = {} 9484 for ped_sample in ped_samples: 9485 ped[ped_sample] = ped_sample 9486 9487 # Check pedigree 9488 if not ped or len(ped) == 0: 9489 msg_error = f"Error in pedigree: samples {ped_samples}" 9490 log.error(msg_error) 9491 raise ValueError(msg_error) 9492 9493 # Log 9494 log.info( 9495 "Calculation 'BARCODEFAMILY' - Samples: " 9496 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9497 ) 9498 log.debug(f"ped_samples={ped_samples}") 9499 9500 # Field 9501 barcode_infos = prefix + tag 9502 9503 # Variants table 9504 table_variants = self.get_table_variants() 9505 9506 # Header 9507 vcf_reader = self.get_header() 9508 9509 # Create variant id 9510 variant_id_column = self.get_variant_id_column() 9511 added_columns = [variant_id_column] 9512 9513 # variant_id, FORMAT and samples 9514 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9515 [f""" "{sample}" """ for sample in ped_samples] 9516 ) 9517 9518 # Create dataframe 9519 dataframe_barcode = self.get_query_to_df( 9520 f""" SELECT {samples_fields} FROM {table_variants} """ 9521 ) 9522 9523 # Create barcode column 9524 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9525 lambda row: barcode(row, samples=ped_samples), axis=1 9526 ) 9527 9528 # Add barcode family to header 9529 # Add vaf_normalization to header 9530 vcf_reader.formats[tag] = vcf.parser._Format( 9531 id=tag, 9532 num=".", 9533 type="String", 9534 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9535 type_code=self.code_type_map.get("String"), 9536 ) 9537 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9538 id=f"{tag}S", 9539 num=".", 9540 type="String", 9541 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 9542 type_code=self.code_type_map.get("String"), 9543 ) 9544 9545 # Update 9546 # for sample in ped_samples: 9547 sql_update_set = [] 9548 for sample in self.get_header_sample_list() + ["FORMAT"]: 9549 if 
    def calculation_trio(self) -> None:
        """
        Perform a trio (father/mother/child) calculation on the genotype
        columns and append the result to the INFO column as 'trio=<value>'.

        The trio is read from param
        `calculation.calculations.TRIO.trio_pedigree` and may be a YAML/JSON
        file path, a JSON string, a comma-separated list of exactly three
        sample names, or a dict with 'father'/'mother'/'child' keys; when
        absent, the first three samples of the VCF are used. The per-variant
        value is computed by the module-level `trio` helper.

        :raises ValueError: if the trio pedigree is malformed or fewer than
            three samples are available
        """

        # Nothing to compute without genotype (FORMAT/sample) information
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # VCF header description for the annotation
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Param
            param = self.get_param()

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio pedigree parameter (file path, JSON string, list or dict)
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Load trio
            if trio_ped:

                # Trio pedigree is a file (YAML or JSON, parsed by safe_load)
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = yaml.safe_load(trio_ped)

                # Trio pedigree is a string: try JSON first, then a
                # comma-separated father,mother,child list
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Trio pedigree is already a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Trio pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct trio list in father, mother, child order
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree: use the first three samples of the VCF
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Check trio pedigree
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Name of the dataframe column holding the computed value
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column, used as the join key for the UPDATE below;
            # it is added to the table and dropped again at the end
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and each (quoted) sample column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the trio value row by row
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Add trio to header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # is a copy-paste leftover; it is dead in practice because the key
            # is always present in vcf_infos_tags
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO, joining on the variant id.
            # The FROM clause names the local variable `dataframe_trio`
            # (DuckDB replacement scan) — do not rename that local.
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                            AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_trio
            gc.collect()
('','.') 9747 THEN '' 9748 ELSE concat("INFO", ';') 9749 END, 9750 CASE 9751 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9752 AND dataframe_trio."{trio_infos}" NOT NULL 9753 THEN concat( 9754 '{trio_tag}=', 9755 dataframe_trio."{trio_infos}" 9756 ) 9757 ELSE '' 9758 END 9759 ) 9760 FROM dataframe_trio 9761 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9762 """ 9763 self.conn.execute(sql_update) 9764 9765 # Remove added columns 9766 for added_column in added_columns: 9767 self.drop_column(column=added_column) 9768 9769 # Delete dataframe 9770 del dataframe_trio 9771 gc.collect() 9772 9773 def calculation_vaf_normalization(self) -> None: 9774 """ 9775 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9776 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9777 :return: The function does not return anything. 9778 """ 9779 9780 # if FORMAT and samples 9781 if ( 9782 "FORMAT" in self.get_header_columns_as_list() 9783 and self.get_header_sample_list() 9784 ): 9785 9786 # vaf_normalization annotation field 9787 vaf_normalization_tag = "VAF" 9788 9789 # VCF infos tags 9790 vcf_infos_tags = { 9791 "VAF": "VAF Variant Frequency", 9792 } 9793 9794 # Prefix 9795 prefix = self.get_explode_infos_prefix() 9796 9797 # Variants table 9798 table_variants = self.get_table_variants() 9799 9800 # Header 9801 vcf_reader = self.get_header() 9802 9803 # Do not calculate if VAF already exists 9804 if "VAF" in vcf_reader.formats: 9805 log.debug("VAF already on genotypes") 9806 return 9807 9808 # Create variant id 9809 variant_id_column = self.get_variant_id_column() 9810 added_columns = [variant_id_column] 9811 9812 # variant_id, FORMAT and samples 9813 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9814 f""" "{sample}" """ for sample in self.get_header_sample_list() 9815 ) 9816 9817 # Create dataframe 9818 query = f""" SELECT 
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant statistics (count, list, min, max, mean, median,
        standard deviation) of a genotype field across all samples and append
        them to the INFO column as '<info>_stats_*=<value>' tags.

        The method is a no-op when the VCF has no FORMAT column or no sample.
        The statistics are computed by the module-level `genotype_stats`
        helper, which returns a mapping of stat-tag to value for each row.

        :param info: name of the genotype field the statistics are computed
            for; it prefixes every generated INFO tag, defaults to VAF
        :type info: str (optional)
        """

        # Nothing to compute without genotype (FORMAT/sample) information
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF header descriptions, one entry per generated statistic tag
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the stats mapping
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column, used as the join key for the UPDATE below;
            # it is added to the table and dropped again at the end
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and each (quoted) sample column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Fetch genotypes into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the statistics mapping row by row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic tag
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own dataframe column
                # (the lambda closes over `stat` but is applied immediately,
                # so the loop-variable binding is safe here)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First tag gets no leading separator; subsequent ones are
                # prefixed with ';'.
                # NOTE(review): the separator depends on the list length, not
                # on whether the previous fragment actually emitted text — if
                # a stat value were SQL NULL the separators could double up;
                # in practice missing stats are '' (see the .get above), so
                # the NOT NULL branch always fires.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Append the statistic tags to INFO, joining on the variant id.
            # The FROM clause names the local variable `dataframe_vaf_stats`
            # (DuckDB replacement scan) — do not rename that local.
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_vaf_stats
            gc.collect()
table and adds an info 10014 field to it if transcripts are available. 10015 10016 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10017 is a string parameter that represents the information field to be used in the transcripts JSON. 10018 It is used to specify the JSON format for the transcripts information. If no value is provided 10019 when calling the method, it defaults to " 10020 :type info_json: str 10021 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10022 method is a string parameter that specifies the format of the information field to be used in 10023 the transcripts JSON. It is used to define the format of the information field 10024 :type info_format: str 10025 """ 10026 10027 # Create transcripts table 10028 transcripts_table = self.create_transcript_view() 10029 10030 # Add info field 10031 if transcripts_table: 10032 self.transcript_view_to_variants( 10033 transcripts_table=transcripts_table, 10034 transcripts_info_field_json=info_json, 10035 transcripts_info_field_format=info_format, 10036 ) 10037 else: 10038 log.info("No Transcripts to process. Check param.json file configuration") 10039 10040 def calculation_transcripts_prioritization(self) -> None: 10041 """ 10042 The function `calculation_transcripts_prioritization` creates a transcripts table and 10043 prioritizes transcripts based on certain criteria. 10044 """ 10045 10046 # Create transcripts table 10047 transcripts_table = self.create_transcript_view() 10048 10049 # Add info field 10050 if transcripts_table: 10051 self.transcripts_prioritization(transcripts_table=transcripts_table) 10052 else: 10053 log.info("No Transcripts to process. 
Check param.json file configuration") 10054 10055 def calculation_transcripts_export(self) -> None: 10056 """ """ 10057 10058 # Create transcripts table 10059 transcripts_table = self.create_transcript_view() 10060 10061 # Add info field 10062 if transcripts_table: 10063 self.transcripts_export(transcripts_table=transcripts_table) 10064 else: 10065 log.info("No Transcripts to process. Check param.json file configuration") 10066 10067 ############### 10068 # Transcripts # 10069 ############### 10070 10071 def transcripts_export( 10072 self, transcripts_table: str = None, param: dict = {} 10073 ) -> bool: 10074 """ """ 10075 10076 log.debug("Start transcripts export...") 10077 10078 # Param 10079 if not param: 10080 param = self.get_param() 10081 10082 # Param export 10083 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10084 10085 # Output file 10086 transcripts_export_output = param_transcript_export.get("output", None) 10087 10088 if not param_transcript_export or not transcripts_export_output: 10089 log.warning(f"No transcriipts export parameters defined!") 10090 return False 10091 10092 # List of transcripts annotations 10093 query_describe = f""" 10094 SELECT column_name 10095 FROM ( 10096 DESCRIBE SELECT * FROM {transcripts_table} 10097 ) 10098 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10099 """ 10100 transcripts_annotations_list = list( 10101 self.get_query_to_df(query=query_describe)["column_name"] 10102 ) 10103 10104 # Create transcripts table for export 10105 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10106 random.choices(string.ascii_uppercase + string.digits, k=10) 10107 ) 10108 query_create_transcripts_table_export = f""" 10109 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10110 """ 10111 self.execute_query(query=query_create_transcripts_table_export) 10112 10113 # 
Output file format 10114 transcripts_export_output_format = get_file_format( 10115 filename=transcripts_export_output 10116 ) 10117 10118 # Format VCF - construct INFO 10119 if transcripts_export_output_format in ["vcf"]: 10120 10121 # Construct query update INFO and header 10122 query_update_info = [] 10123 for field in transcripts_annotations_list: 10124 10125 # If field not in header 10126 if field not in self.get_header_infos_list(): 10127 10128 # Add PZ Transcript in header 10129 self.get_header().infos[field] = vcf.parser._Info( 10130 field, 10131 ".", 10132 "String", 10133 f"Annotation '{field}' from transcript view", 10134 "unknown", 10135 "unknown", 10136 0, 10137 ) 10138 10139 # Add field as INFO/tag 10140 query_update_info.append( 10141 f""" 10142 CASE 10143 WHEN "{field}" IS NOT NULL 10144 THEN concat('{field}=', "{field}", ';') 10145 ELSE '' 10146 END 10147 """ 10148 ) 10149 10150 # Query param 10151 query_update_info_value = ( 10152 f""" concat('', {", ".join(query_update_info)}) """ 10153 ) 10154 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' 
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        Prioritize transcripts and write the selected transcript's annotations back
        into the variants table INFO column.

        The method ranks the rows of the transcripts table per variant
        ("#CHROM", POS, REF, ALT) using the prioritization fields (PZ fields) and an
        optional transcript-preference file, then updates each variant's INFO with
        `<pzprefix>Transcript=<transcript>` plus the configured PZ fields of the
        top-ranked (rn = 1) transcript.

        :param transcripts_table: name of the table containing transcripts data; if
            None, the table is created via `create_transcript_view` (view name
            "transcripts")
        :type transcripts_table: str
        :param param: configuration dict; if falsy, `self.get_param()` is used.
            Relevant keys live under param["transcripts"]["prioritization"]
            (pzprefix, pzfields, profiles, prioritization_transcripts,
            prioritization_transcripts_order, prioritization_transcripts_force,
            prioritization_transcripts_version_force)
        :type param: dict
        :return: True when prioritization completed; False when no profile is
            configured or the underlying `self.prioritization` call reports failure
        :raises ValueError: when no transcripts table is available, when the
            transcript-preference file does not exist, or when a field to explode is
            missing from both the VCF header and the transcripts table
        """

        log.debug("Start transcripts prioritization...")

        # Param: fall back to the object's stored parameters
        if not param:
            param = self.get_param()

        # Variants table (UPDATE target)
        table_variants = self.get_table_variants()

        # Transcripts table: create the transcripts view on demand
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            if transcripts_table is None:
                # NOTE(review): typo "availalble" kept as-is — runtime message
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO column if it does not exist (needed by prioritization)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param (sub-dict of param["transcripts"])
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile is configured
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: maps source field name -> INFO tag name to emit
        pz_param_pzfields = {}

        # PZ field holding the selected transcript (e.g. "PTZTranscript")
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add the PZ Transcript field to the VCF header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory PZ fields always produced by prioritization
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields requested in param: mandatory ones keep their prefixed name,
        # extra annotation fields are re-tagged with the prefix and declared in header
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add the prefixed annotation field to the VCF header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # NOTE(review): this mutates the caller's nested param dict in place —
        # after this call param["transcripts"]["prioritization"]["pzfields"] is
        # replaced with the mandatory prefixed fields
        pz_param["pzfields"] = pz_mandatory_fields

        # Run prioritization on the transcripts table
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # Build SQL fragments: selected columns, INFO concat pieces, ORDER BY terms
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by: configured order, or Flag DESC then Score DESC by default
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode into real columns before ranking
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check that each field to explode exists in header or transcripts table
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file (ranked list of preferred transcripts)
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced (preference order wins over PZ order)
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced (match transcript IDs including version suffix)
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                # NOTE(review): this local DataFrame is referenced BY NAME inside
                # the SQL below ("FROM transcripts_preference_dataframe") via
                # DuckDB's replacement scan — do not rename/remove as "unused"
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending on transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript join condition depends on version consideration
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update (rank per variant, joined on preference)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update (rank per variant on PZ order only)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table.
        # NOTE(review): the WHERE clause references the alias "variants." while the
        # UPDATE target is {table_variants} — assumes table_variants is named
        # "variants"; confirm against get_table_variants()
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True
ranking for update 10434 query_update_ranking = f""" 10435 SELECT 10436 "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)} 10437 ROW_NUMBER() OVER ( 10438 PARTITION BY "#CHROM", POS, REF, ALT 10439 ORDER BY {order_by} 10440 ) AS rn 10441 FROM {transcripts_table} 10442 LEFT JOIN 10443 ( 10444 SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order 10445 FROM transcripts_preference_dataframe 10446 ) AS transcripts_preference 10447 ON {transcripts_version_join} 10448 """ 10449 10450 else: 10451 10452 # Query ranking for update 10453 query_update_ranking = f""" 10454 SELECT 10455 "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)} 10456 ROW_NUMBER() OVER ( 10457 PARTITION BY "#CHROM", POS, REF, ALT 10458 ORDER BY {" , ".join(query_update_order_list)} 10459 ) AS rn 10460 FROM {transcripts_table} 10461 """ 10462 10463 # Export Transcripts prioritization infos to variants table 10464 query_update = f""" 10465 WITH RankedTranscripts AS ( 10466 {query_update_ranking} 10467 ) 10468 UPDATE {table_variants} 10469 SET 10470 INFO = CONCAT(CASE 10471 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10472 THEN '' 10473 ELSE concat("INFO", ';') 10474 END, 10475 concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)}) 10476 ) 10477 FROM 10478 RankedTranscripts 10479 WHERE 10480 rn = 1 10481 AND variants."#CHROM" = RankedTranscripts."#CHROM" 10482 AND variants."POS" = RankedTranscripts."POS" 10483 AND variants."REF" = RankedTranscripts."REF" 10484 AND variants."ALT" = RankedTranscripts."ALT" 10485 """ 10486 10487 # log.debug(f"query_update={query_update}") 10488 self.execute_query(query=query_update) 10489 10490 # Return 10491 return True 10492 10493 def create_transcript_view_from_columns_map( 10494 self, 10495 transcripts_table: str = "transcripts", 10496 columns_maps: dict = {}, 10497 added_columns: list = [], 10498 temporary_tables: list = None, 10499 
annotation_fields: list = None, 10500 column_rename: dict = {}, 10501 column_clean: bool = False, 10502 column_case: str = None, 10503 ) -> tuple[list, list, list]: 10504 """ 10505 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10506 specified columns mapping for transcripts data. 10507 10508 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10509 of the table where the transcripts data is stored or will be stored in the database. This table 10510 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10511 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10512 :type transcripts_table: str (optional) 10513 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10514 about how to map columns from a transcripts table to create a view. Each entry in the 10515 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10516 typically includes details such as the main transcript column and additional information columns 10517 :type columns_maps: dict 10518 :param added_columns: The `added_columns` parameter in the 10519 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10520 that will be added to the view being created based on the columns map provided. These columns 10521 are generated by exploding the transcript information columns along with the main transcript 10522 column 10523 :type added_columns: list 10524 :param temporary_tables: The `temporary_tables` parameter in the 10525 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10526 tables created during the process of creating a transcript view from a columns map. 
These 10527 temporary tables are used to store intermediate results or transformations before the final view 10528 is generated 10529 :type temporary_tables: list 10530 :param annotation_fields: The `annotation_fields` parameter in the 10531 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10532 used for annotation in the query view creation process. These fields are extracted from the 10533 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10534 :type annotation_fields: list 10535 :param column_rename: The `column_rename` parameter in the 10536 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10537 custom renaming for columns during the creation of the temporary table view. This parameter 10538 provides a mapping of original column names to the desired renamed column names. By using this 10539 parameter, 10540 :type column_rename: dict 10541 :param column_clean: The `column_clean` parameter in the 10542 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10543 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10544 removing any non-alphanumeric characters from them. This cleaning process ensures, defaults to 10545 False 10546 :type column_clean: bool (optional) 10547 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10548 function is used to specify the case transformation to be applied to the columns during the view 10549 creation process. It allows you to control whether the column values should be converted to 10550 lowercase, uppercase, or remain unchanged 10551 :type column_case: str 10552 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10553 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 
10554 """ 10555 10556 log.debug("Start transcrpts view creation from columns map...") 10557 10558 # "from_columns_map": [ 10559 # { 10560 # "transcripts_column": "Ensembl_transcriptid", 10561 # "transcripts_infos_columns": [ 10562 # "genename", 10563 # "Ensembl_geneid", 10564 # "LIST_S2_score", 10565 # "LIST_S2_pred", 10566 # ], 10567 # }, 10568 # { 10569 # "transcripts_column": "Ensembl_transcriptid", 10570 # "transcripts_infos_columns": [ 10571 # "genename", 10572 # "VARITY_R_score", 10573 # "Aloft_pred", 10574 # ], 10575 # }, 10576 # ], 10577 10578 # Init 10579 if temporary_tables is None: 10580 temporary_tables = [] 10581 if annotation_fields is None: 10582 annotation_fields = [] 10583 10584 # Variants table 10585 table_variants = self.get_table_variants() 10586 10587 for columns_map in columns_maps: 10588 10589 # Transcript column 10590 transcripts_column = columns_map.get("transcripts_column", None) 10591 10592 # Transcripts infos columns 10593 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10594 10595 # Transcripts infos columns rename 10596 column_rename = columns_map.get("column_rename", column_rename) 10597 10598 # Transcripts infos columns clean 10599 column_clean = columns_map.get("column_clean", column_clean) 10600 10601 # Transcripts infos columns case 10602 column_case = columns_map.get("column_case", column_case) 10603 10604 if transcripts_column is not None: 10605 10606 # Explode 10607 added_columns += self.explode_infos( 10608 fields=[transcripts_column] + transcripts_infos_columns 10609 ) 10610 10611 # View clauses 10612 clause_select_variants = [] 10613 clause_select_tanscripts = [] 10614 for field in [transcripts_column] + transcripts_infos_columns: 10615 10616 # AS field 10617 as_field = field 10618 10619 # Rename 10620 if column_rename: 10621 as_field = column_rename.get(as_field, as_field) 10622 10623 # Clean 10624 if column_clean: 10625 as_field = clean_annotation_field(as_field) 10626 10627 # Case 10628 if 
column_case: 10629 if column_case.lower() in ["lower"]: 10630 as_field = as_field.lower() 10631 elif column_case.lower() in ["upper"]: 10632 as_field = as_field.upper() 10633 10634 # Clause select Variants 10635 clause_select_variants.append( 10636 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10637 ) 10638 10639 if field in [transcripts_column]: 10640 clause_select_tanscripts.append( 10641 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10642 ) 10643 else: 10644 clause_select_tanscripts.append( 10645 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10646 ) 10647 annotation_fields.append(as_field) 10648 10649 # Querey View 10650 query = f""" 10651 SELECT 10652 "#CHROM", POS, REF, ALT, INFO, 10653 "{transcripts_column}" AS 'transcript', 10654 {", ".join(clause_select_tanscripts)} 10655 FROM ( 10656 SELECT 10657 "#CHROM", POS, REF, ALT, INFO, 10658 {", ".join(clause_select_variants)} 10659 FROM {table_variants} 10660 ) 10661 WHERE "{transcripts_column}" IS NOT NULL 10662 """ 10663 10664 # Create temporary table 10665 temporary_table = transcripts_table + "".join( 10666 random.choices(string.ascii_uppercase + string.digits, k=10) 10667 ) 10668 10669 # Temporary_tables 10670 temporary_tables.append(temporary_table) 10671 query_view = f""" 10672 CREATE TEMPORARY TABLE {temporary_table} 10673 AS ({query}) 10674 """ 10675 self.execute_query(query=query_view) 10676 10677 return added_columns, temporary_tables, annotation_fields 10678 10679 def create_transcript_view_from_column_format( 10680 self, 10681 transcripts_table: str = "transcripts", 10682 column_formats: dict = {}, 10683 temporary_tables: list = None, 10684 annotation_fields: list = None, 10685 column_rename: dict = {}, 10686 column_clean: bool = False, 10687 column_case: str = None, 10688 ) -> tuple[list, list, list]: 10689 """ 10690 The `create_transcript_view_from_column_format` function generates a transcript view based on 10691 specified column formats, adds additional 
columns and annotation fields, and returns the list of 10692 temporary tables and annotation fields. 10693 10694 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10695 of the table containing the transcripts data. This table will be used as the base table for 10696 creating the transcript view. The default value for this parameter is "transcripts", but you can 10697 provide a different table name if needed, defaults to transcripts 10698 :type transcripts_table: str (optional) 10699 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10700 about the columns to be used for creating the transcript view. Each entry in the dictionary 10701 specifies the mapping between a transcripts column and a transcripts infos column. This 10702 parameter allows you to define how the columns from the transcripts table should be transformed 10703 or mapped 10704 :type column_formats: dict 10705 :param temporary_tables: The `temporary_tables` parameter in the 10706 `create_transcript_view_from_column_format` function is a list that stores the names of 10707 temporary views created during the process of creating a transcript view from a column format. 10708 These temporary views are used to manipulate and extract data before generating the final 10709 transcript view 10710 :type temporary_tables: list 10711 :param annotation_fields: The `annotation_fields` parameter in the 10712 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10713 that are extracted from the temporary views created during the process. 
These annotation fields 10714 are obtained by querying the temporary views and extracting the column names excluding specific 10715 columns like `#CH 10716 :type annotation_fields: list 10717 :param column_rename: The `column_rename` parameter in the 10718 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10719 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10720 column names to new column names in this dictionary, you can rename specific columns during the 10721 process 10722 :type column_rename: dict 10723 :param column_clean: The `column_clean` parameter in the 10724 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10725 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10726 will be cleaned during the creation of the transcript view based on the specified column format, 10727 defaults to False 10728 :type column_clean: bool (optional) 10729 :param column_case: The `column_case` parameter in the 10730 `create_transcript_view_from_column_format` function is used to specify the case transformation 10731 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10732 to convert the column names to uppercase or lowercase, respectively 10733 :type column_case: str 10734 :return: The `create_transcript_view_from_column_format` function returns two lists: 10735 `temporary_tables` and `annotation_fields`. 
10736 """ 10737 10738 log.debug("Start transcrpts view creation from column format...") 10739 10740 # "from_column_format": [ 10741 # { 10742 # "transcripts_column": "ANN", 10743 # "transcripts_infos_column": "Feature_ID", 10744 # } 10745 # ], 10746 10747 # Init 10748 if temporary_tables is None: 10749 temporary_tables = [] 10750 if annotation_fields is None: 10751 annotation_fields = [] 10752 10753 for column_format in column_formats: 10754 10755 # annotation field and transcript annotation field 10756 annotation_field = column_format.get("transcripts_column", "ANN") 10757 transcript_annotation = column_format.get( 10758 "transcripts_infos_column", "Feature_ID" 10759 ) 10760 10761 # Transcripts infos columns rename 10762 column_rename = column_format.get("column_rename", column_rename) 10763 10764 # Transcripts infos columns clean 10765 column_clean = column_format.get("column_clean", column_clean) 10766 10767 # Transcripts infos columns case 10768 column_case = column_format.get("column_case", column_case) 10769 10770 # Temporary View name 10771 temporary_view_name = transcripts_table + "".join( 10772 random.choices(string.ascii_uppercase + string.digits, k=10) 10773 ) 10774 10775 # Create temporary view name 10776 temporary_view_name = self.annotation_format_to_table( 10777 uniquify=True, 10778 annotation_field=annotation_field, 10779 view_name=temporary_view_name, 10780 annotation_id=transcript_annotation, 10781 column_rename=column_rename, 10782 column_clean=column_clean, 10783 column_case=column_case, 10784 ) 10785 10786 # Annotation fields 10787 if temporary_view_name: 10788 query_annotation_fields = f""" 10789 SELECT * 10790 FROM ( 10791 DESCRIBE SELECT * 10792 FROM {temporary_view_name} 10793 ) 10794 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10795 """ 10796 df_annotation_fields = self.get_query_to_df( 10797 query=query_annotation_fields 10798 ) 10799 10800 # Add temporary view and annotation fields 10801 
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        Build the transcripts table from the "struct" configuration: explode
        per-transcript annotations (columns-map and column-format sources) into
        temporary tables, merge them with UNION BY NAME, optionally remap and/or
        strip transcript IDs, aggregate per transcript, and materialize the result.

        :param transcripts_table: name of the table to create; when None the name
            comes from param["transcripts"]["table"] (default "transcripts")
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: drop an existing table of that name before
            creating it (otherwise CREATE TABLE IF NOT EXISTS keeps the old one),
            defaults to False
        :type transcripts_table_drop: bool (optional)
        :param param: configuration dict; if falsy, `self.get_param()` is used.
            Drives struct sources, transcript_id_remove_version,
            transcript_id_mapping_file and transcript_id_mapping_force
        :type param: dict
        :return: the name of the created transcripts table, or None when no
            "struct" configuration is present
        """

        log.debug("Start transcripts view creation...")

        # Default table name
        transcripts_table_default = "transcripts"

        # Param: fall back to the object's stored parameters
        if not param:
            param = self.get_param()

        # Struct: describes where per-transcript annotations come from
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: strip ".N" version suffix from transcript IDs
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping file (transcript <-> alias)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Keep only transcripts present in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table name
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table by explode_infos (removed at end)
            added_columns = []

            # Temporary tables
            temporary_tables = []

            # Annotation fields
            annotation_fields = []

            # from columns map.
            # NOTE(review): the helper extends these lists in place AND returns
            # them, so the "+=" below duplicates entries; temporary_tables and
            # annotation_fields are deduplicated via set() later, added_columns
            # may contain duplicates — confirm drop_column tolerates that
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # from column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Remove variant-key/reserved columns from the annotation fields
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query (UNION BY NAME over unique tables)
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases used for the nested subqueries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # Merge on transcript: per-transcript aggregation clauses
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list (all distinct source transcript IDs per group)
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields (distinct values joined with ',')
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # NOTE(review): this local DataFrame is referenced BY NAME inside
                # the SQL ("transcript_id_mapping_dataframe") via DuckDB's
                # replacement scan — do not remove it as "unused"
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version remove: compare/emit IDs without ".N" suffix
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Transcript column for group by merge (mapped ID wins over original)
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query: aggregate per (variant, original/mapped transcript)
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        # transcript_original is re-exposed as "transcripts_mapped"
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping: final grouping by mapped-or-original ID
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Add transcript filter from mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # NOTE(review): these two assignments are not used afterwards —
                # the query below interpolates query_transcript_column directly
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view (no mapping: transcript_mapped NULL)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcript table if requested
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove the columns explode_infos added to the variants table
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # No "struct" configuration: nothing to build
            transcripts_table = None

        return transcripts_table
11054 ) 11055 query_transcript_column_group_by = query_transcript_column 11056 11057 # Query for transcripts view 11058 query_merge_on_transcripts = f""" 11059 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)} 11060 FROM ({query_merge}) AS {transcript_table_tmp} 11061 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} 11062 """ 11063 11064 log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}") 11065 11066 # Drop transcript view is necessary 11067 if transcripts_table_drop: 11068 query_drop = f""" 11069 DROP TABLE IF EXISTS {transcripts_table}; 11070 """ 11071 self.execute_query(query=query_drop) 11072 11073 # Merge and create transcript view 11074 query_create_view = f""" 11075 CREATE TABLE IF NOT EXISTS {transcripts_table} 11076 AS {query_merge_on_transcripts} 11077 """ 11078 self.execute_query(query=query_create_view) 11079 11080 # Remove added columns 11081 for added_column in added_columns: 11082 self.drop_column(column=added_column) 11083 11084 else: 11085 11086 transcripts_table = None 11087 11088 return transcripts_table 11089 11090 def annotation_format_to_table( 11091 self, 11092 uniquify: bool = True, 11093 annotation_field: str = "ANN", 11094 annotation_id: str = "Feature_ID", 11095 view_name: str = "transcripts", 11096 column_rename: dict = {}, 11097 column_clean: bool = False, 11098 column_case: str = None, 11099 ) -> str: 11100 """ 11101 The `annotation_format_to_table` function converts annotation data from a VCF file into a 11102 structured table format, ensuring unique values and creating a temporary table for further 11103 processing or analysis. 11104 11105 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure 11106 unique values in the output or not. 
If set to `True`, the function will make sure that the 11107 output values are unique, defaults to True 11108 :type uniquify: bool (optional) 11109 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file 11110 that contains the annotation information for each variant. This field is used to extract the 11111 annotation details for further processing in the function. By default, it is set to "ANN", 11112 defaults to ANN 11113 :type annotation_field: str (optional) 11114 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method 11115 is used to specify the identifier for the annotation feature. This identifier will be used as a 11116 column name in the resulting table or view that is created based on the annotation data. It 11117 helps in uniquely identifying each annotation entry in the, defaults to Feature_ID 11118 :type annotation_id: str (optional) 11119 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used 11120 to specify the name of the temporary table that will be created to store the transformed 11121 annotation data. This table will hold the extracted information from the annotation field in a 11122 structured format for further processing or analysis. By default,, defaults to transcripts 11123 :type view_name: str (optional) 11124 :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method 11125 is a dictionary that allows you to specify custom renaming for columns. By providing key-value 11126 pairs in this dictionary, you can rename specific columns in the resulting table or view that is 11127 created based on the annotation data. This feature enables 11128 :type column_rename: dict 11129 :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is 11130 a boolean flag that determines whether the annotation field should undergo a cleaning process. 
11131 If set to `True`, the function will clean the annotation field before further processing. This 11132 cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults 11133 to False 11134 :type column_clean: bool (optional) 11135 :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is 11136 used to specify the case transformation to be applied to the column names extracted from the 11137 annotation data. It allows you to set the case of the column names to either lowercase or 11138 uppercase for consistency or other specific requirements during the conversion 11139 :type column_case: str 11140 :return: The function `annotation_format_to_table` is returning the name of the view created, 11141 which is stored in the variable `view_name`. 11142 """ 11143 11144 # Annotation field 11145 annotation_format = "annotation_explode" 11146 11147 # Transcript annotation 11148 if column_rename: 11149 annotation_id = column_rename.get(annotation_id, annotation_id) 11150 11151 if column_clean: 11152 annotation_id = clean_annotation_field(annotation_id) 11153 11154 # Prefix 11155 prefix = self.get_explode_infos_prefix() 11156 if prefix: 11157 prefix = "INFO/" 11158 11159 # Annotation fields 11160 annotation_infos = prefix + annotation_field 11161 annotation_format_infos = prefix + annotation_format 11162 11163 # Variants table 11164 table_variants = self.get_table_variants() 11165 11166 # Header 11167 vcf_reader = self.get_header() 11168 11169 # Add columns 11170 added_columns = [] 11171 11172 # Explode HGVS field in column 11173 added_columns += self.explode_infos(fields=[annotation_field]) 11174 11175 if annotation_field in vcf_reader.infos: 11176 11177 # Extract ANN header 11178 ann_description = vcf_reader.infos[annotation_field].desc 11179 pattern = r"'(.+?)'" 11180 match = re.search(pattern, ann_description) 11181 if match: 11182 ann_header_match = match.group(1).split(" | ") 11183 ann_header = [] 
11184 ann_header_desc = {} 11185 for i in range(len(ann_header_match)): 11186 ann_header_info = "".join( 11187 char for char in ann_header_match[i] if char.isalnum() 11188 ) 11189 ann_header.append(ann_header_info) 11190 ann_header_desc[ann_header_info] = ann_header_match[i] 11191 if not ann_header_desc: 11192 raise ValueError("Invalid header description format") 11193 else: 11194 raise ValueError("Invalid header description format") 11195 11196 # Create variant id 11197 variant_id_column = self.get_variant_id_column() 11198 added_columns += [variant_id_column] 11199 11200 # Create dataframe 11201 dataframe_annotation_format = self.get_query_to_df( 11202 f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 11203 ) 11204 11205 # Create annotation columns 11206 dataframe_annotation_format[ 11207 annotation_format_infos 11208 ] = dataframe_annotation_format[annotation_infos].apply( 11209 lambda x: explode_annotation_format( 11210 annotation=str(x), 11211 uniquify=uniquify, 11212 output_format="JSON", 11213 prefix="", 11214 header=list(ann_header_desc.values()), 11215 ) 11216 ) 11217 11218 # Find keys 11219 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 11220 df_keys = self.get_query_to_df(query=query_json) 11221 11222 # Check keys 11223 query_json_key = [] 11224 for _, row in df_keys.iterrows(): 11225 11226 # Key 11227 key = row.iloc[0] 11228 key_clean = key 11229 11230 # key rename 11231 if column_rename: 11232 key_clean = column_rename.get(key_clean, key_clean) 11233 11234 # key clean 11235 if column_clean: 11236 key_clean = clean_annotation_field(key_clean) 11237 11238 # Key case 11239 if column_case: 11240 if column_case.lower() in ["lower"]: 11241 key_clean = key_clean.lower() 11242 elif column_case.lower() in ["upper"]: 11243 key_clean = key_clean.upper() 11244 11245 # Type 11246 query_json_type = f"""SELECT 
unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" 11247 11248 # Get DataFrame from query 11249 df_json_type = self.get_query_to_df(query=query_json_type) 11250 11251 # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN 11252 with pd.option_context("future.no_silent_downcasting", True): 11253 df_json_type.fillna(value="", inplace=True) 11254 replace_dict = {None: np.nan, "": np.nan} 11255 df_json_type.replace(replace_dict, inplace=True) 11256 df_json_type.dropna(inplace=True) 11257 11258 # Detect column type 11259 column_type = detect_column_type(df_json_type[key_clean]) 11260 11261 # Append 11262 query_json_key.append( 11263 f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 11264 ) 11265 11266 # Create view 11267 query_view = f""" 11268 CREATE TEMPORARY TABLE {view_name} 11269 AS ( 11270 SELECT *, {annotation_id} AS 'transcript' 11271 FROM ( 11272 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 11273 FROM dataframe_annotation_format 11274 ) 11275 ); 11276 """ 11277 self.execute_query(query=query_view) 11278 11279 else: 11280 11281 # Return None 11282 view_name = None 11283 11284 # Remove added columns 11285 for added_column in added_columns: 11286 self.drop_column(column=added_column) 11287 11288 return view_name 11289 11290 def transcript_view_to_variants( 11291 self, 11292 transcripts_table: str = None, 11293 transcripts_column_id: str = None, 11294 transcripts_info_json: str = None, 11295 transcripts_info_field_json: str = None, 11296 transcripts_info_format: str = None, 11297 transcripts_info_field_format: str = None, 11298 param: dict = {}, 11299 ) -> bool: 11300 """ 11301 The `transcript_view_to_variants` function updates a variants table with information from 11302 transcripts in JSON format. 
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Export the transcripts view back into the variants table, as a JSON
        column/INFO field and/or as a structured (pipe-separated) column/INFO
        field.

        :param transcripts_table: name of the transcripts table (default from
            param, fallback "transcripts")
        :param transcripts_column_id: column of the transcripts table holding
            the transcript identifier (default from param, fallback
            "transcript")
        :param transcripts_info_json: variants table column to fill with the
            transcripts in JSON format
        :param transcripts_info_field_json: VCF INFO field to fill with the
            transcripts in JSON format
        :param transcripts_info_format: variants table column to fill with the
            transcripts in structured format
        :param transcripts_info_field_format: VCF INFO field to fill with the
            transcripts in structured format
        :param param: parameters dictionary (defaults to self.get_param())
        :return: True on success, False if no output column/field is
            configured
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to export: no output column/field configured
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts infos columns (all annotation columns except coordinates
        # and the transcript identifier itself)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # Build SELECT / JSON-struct / pipe-format clauses for each column
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            # Do not consider INFO field for export into fields
            if field not in ["INFO"]:
                clause_select.append(
                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
                )
                clause_to_json.append(f""" '{field}': "{field}" """)
                clause_to_format.append(f""" "{field}" """)

        # Update
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append the JSON payload to INFO, guarding against empty/'.' values
            # NOTE(review): this clause interpolates {transcripts_info_json},
            # which may be None when only the INFO field output is configured
            # (the FORMAT path below has an explicit fallback, the JSON path
            # does not) — TODO confirm intended behavior
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Aggregate per variant one JSON object keyed by transcript id,
            # then join back on coordinates. Presumably relies on string_agg's
            # default ',' separator — verify against DuckDB documentation
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        else:

            # Set variable for internal queries (alias used in the query below)
            transcripts_info_format = "transcripts_info_format"

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append the structured payload to INFO, guarding against empty/'.'
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Aggregate per variant one pipe-separated record per transcript,
            # then join back on coordinates
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
11660 11661 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the 11662 mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary 11663 represent the original field names that need to be renamed, and the corresponding values 11664 represent the new names to which the fields should be 11665 :type fields_to_rename: dict 11666 :param table: The `table` parameter in the `rename_info_fields` function represents the name of 11667 the table in which the variants data is stored. This table contains information about genetic 11668 variants, and the function updates the corresponding INFO fields in this table when renaming 11669 specified fields in the VCF file header 11670 :type table: str 11671 :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains 11672 the original field names as keys and their corresponding new names (or None if the field was 11673 removed) as values after renaming or removing specified fields in a VCF file header and updating 11674 corresponding INFO fields in the variants table. 
11675 """ 11676 11677 # Init 11678 fields_renamed = {} 11679 config = self.get_config() 11680 access = config.get("access") 11681 11682 if table is None: 11683 table = self.get_table_variants() 11684 11685 # regexp replace fonction 11686 regex_replace_dict = {} 11687 regex_replace_nb = 0 11688 regex_replace_partition = 125 11689 regex_replace = "INFO" 11690 11691 if fields_to_rename is not None and access not in ["RO"]: 11692 11693 log.info("Rename or remove fields...") 11694 11695 # Header 11696 header = self.get_header() 11697 11698 for field_to_rename, field_renamed in fields_to_rename.items(): 11699 11700 if field_to_rename in header.infos: 11701 11702 # Rename header 11703 if field_renamed is not None: 11704 header.infos[field_renamed] = vcf.parser._Info( 11705 field_renamed, 11706 header.infos[field_to_rename].num, 11707 header.infos[field_to_rename].type, 11708 header.infos[field_to_rename].desc, 11709 header.infos[field_to_rename].source, 11710 header.infos[field_to_rename].version, 11711 header.infos[field_to_rename].type_code, 11712 ) 11713 del header.infos[field_to_rename] 11714 11715 # Rename INFO patterns 11716 field_pattern = rf'(^|;)({field_to_rename})($|;|=[^;]*)' 11717 if field_renamed is not None: 11718 field_renamed_pattern = rf'\1{field_renamed}\3' 11719 else: 11720 field_renamed_pattern = '' 11721 11722 # regexp replace 11723 regex_replace_nb += 1 11724 regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition) 11725 if (regex_replace_nb % regex_replace_partition) == 0: 11726 regex_replace = "INFO" 11727 regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')" 11728 regex_replace_dict[regex_replace_key] = regex_replace 11729 11730 # Return 11731 fields_renamed[field_to_rename] = field_renamed 11732 11733 # Log 11734 if field_renamed is not None: 11735 log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'") 11736 else: 11737 log.info(f"Rename or remove 
fields - field '{field_to_rename}' removed") 11738 11739 else: 11740 11741 log.warning(f"Rename or remove fields - field '{field_to_rename}' not in header") 11742 11743 11744 # Rename INFO 11745 for regex_replace_key, regex_replace in regex_replace_dict.items(): 11746 log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...") 11747 query = f""" 11748 UPDATE {table} 11749 SET 11750 INFO = {regex_replace} 11751 """ 11752 log.debug(f"query={query}") 11753 self.execute_query(query=query) 11754 11755 return fields_renamed 11756 11757 def calculation_rename_info_fields( 11758 self, 11759 fields_to_rename: dict = None, 11760 table: str = None, 11761 operation_name: str = "RENAME_INFO_FIELDS", 11762 ) -> None: 11763 """ 11764 The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates 11765 fields to rename and table if provided, and then calls another function to rename the fields. 11766 11767 :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be 11768 renamed in a table. Each key-value pair in the dictionary represents the original field name as 11769 the key and the new field name as the value 11770 :type fields_to_rename: dict 11771 :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to 11772 specify the name of the table for which the fields are to be renamed. It is a string type 11773 parameter 11774 :type table: str 11775 :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields` 11776 method is a string that specifies the name of the operation being performed. 
In this context, it 11777 is used as a default value for the operation name if not explicitly provided when calling the 11778 function, defaults to RENAME_INFO_FIELDS 11779 :type operation_name: str (optional) 11780 """ 11781 11782 # Param 11783 param = self.get_param() 11784 11785 # Get param fields to rename 11786 param_fields_to_rename = ( 11787 param.get("calculation", {}) 11788 .get("calculations", {}) 11789 .get(operation_name, {}) 11790 .get("fields_to_rename", None) 11791 ) 11792 11793 # Get param table 11794 param_table = ( 11795 param.get("calculation", {}) 11796 .get("calculations", {}) 11797 .get(operation_name, {}) 11798 .get("table", None) 11799 ) 11800 11801 # Init fields_to_rename 11802 if fields_to_rename is None: 11803 fields_to_rename = param_fields_to_rename 11804 11805 # Init table 11806 if table is None: 11807 table = param_table 11808 11809 renamed_fields = self.rename_info_fields( 11810 fields_to_rename=fields_to_rename, table=table 11811 ) 11812 11813 log.debug(f"renamed_fields:{renamed_fields}")
class Variants:

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = None,
        param: dict = None,
        load: bool = False,
    ) -> None:
        """
        Initialize the object: input/output files, configuration, parameters,
        database connexion, header and samples; optionally load the data.

        :param conn: existing database connexion (a new one is created if None)
        :param input: input file path
        :param output: output file path
        :param config: configuration dictionary (defaults to an empty dict)
        :param param: parameters dictionary (defaults to an empty dict)
        :param load: if True, load the input data immediately
        """

        # Fix: avoid shared mutable default arguments — a dict literal in the
        # signature would be shared by every instance created without an
        # explicit config/param
        if config is None:
            config = {}
        if param is None:
            param = {}

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        Set the samples attribute to the provided list, falling back to the
        'samples.list' entry of the parameters when no list is given.

        :param samples: list of sample names; when falsy, taken from
            self.get_param()["samples"]["list"] (None if absent)
        :return: the samples list that was set
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples

    def get_samples(self) -> list:
        """
        Return the samples attribute of the object.

        :return: the list of samples (possibly None)
        """

        return self.samples

    def get_samples_check(self) -> bool:
        """
        Return the 'samples.check' flag from the parameters.

        :return: the value of the "check" key inside the "samples" dictionary
            of the parameters; True when the key is absent (samples are
            checked by default)
        """

        return self.get_param().get("samples", {}).get("check", True)

    def set_input(self, input: str = None) -> None:
        """
        Set input file attributes: path, base name, extension and format.

        :param input: input file path, or a file-like object exposing a
            `.name` attribute; None leaves the input unset
        :raises ValueError: if a non-string input has no usable `.name`
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            # Fix: catch only the failing attribute access (was a bare
            # except) and close the quote in the message (the original
            # f-string left the quote around {input} unterminated)
            except AttributeError as err:
                log.error(f"Input file '{input}' in bad format")
                raise ValueError(f"Input file '{input}' in bad format") from err
        else:
            self.input = input

        # Input format (derived attributes are only set when input is provided)
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        Assign the given dictionary as the configuration object of the class.

        :param config: configuration dictionary for the instance
        """

        self.config = config
When you call the `set_config` function with a 158 dictionary object as the argument, it will set that dictionary as the configuration object for 159 the class 160 :type config: dict 161 """ 162 163 self.config = config 164 165 def set_param(self, param: dict) -> None: 166 """ 167 This function sets a parameter object for the class based on the input dictionary. 168 169 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 170 as the `param` attribute of the class instance 171 :type param: dict 172 """ 173 174 self.param = param 175 176 def init_variables(self) -> None: 177 """ 178 This function initializes the variables that will be used in the rest of the class 179 """ 180 181 self.prefix = "howard" 182 self.table_variants = "variants" 183 self.dataframe = None 184 185 self.comparison_map = { 186 "gt": ">", 187 "gte": ">=", 188 "lt": "<", 189 "lte": "<=", 190 "equals": "=", 191 "contains": "SIMILAR TO", 192 } 193 194 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 195 196 self.code_type_map_to_sql = { 197 "Integer": "INTEGER", 198 "String": "VARCHAR", 199 "Float": "FLOAT", 200 "Flag": "VARCHAR", 201 } 202 203 self.index_additionnal_fields = [] 204 205 def get_indexing(self) -> bool: 206 """ 207 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 208 returns False. 209 :return: The value of the indexing parameter. 210 """ 211 212 return self.get_param().get("indexing", False) 213 214 def get_connexion_config(self) -> dict: 215 """ 216 The function `get_connexion_config` returns a dictionary containing the configuration for a 217 connection, including the number of threads and memory limit. 218 :return: a dictionary containing the configuration for the Connexion library. 
219 """ 220 221 # config 222 config = self.get_config() 223 224 # Connexion config 225 connexion_config = {} 226 threads = self.get_threads() 227 228 # Threads 229 if threads: 230 connexion_config["threads"] = threads 231 232 # Memory 233 # if config.get("memory", None): 234 # connexion_config["memory_limit"] = config.get("memory") 235 if self.get_memory(): 236 connexion_config["memory_limit"] = self.get_memory() 237 238 # Temporary directory 239 if config.get("tmp", None): 240 connexion_config["temp_directory"] = config.get("tmp") 241 242 # Access 243 if config.get("access", None): 244 access = config.get("access") 245 if access in ["RO"]: 246 access = "READ_ONLY" 247 elif access in ["RW"]: 248 access = "READ_WRITE" 249 connexion_db = self.get_connexion_db() 250 if connexion_db in ":memory:": 251 access = "READ_WRITE" 252 connexion_config["access_mode"] = access 253 254 return connexion_config 255 256 def get_duckdb_settings(self) -> dict: 257 """ 258 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 259 string. 260 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 261 """ 262 263 # config 264 config = self.get_config() 265 266 # duckdb settings 267 duckdb_settings_dict = {} 268 if config.get("duckdb_settings", None): 269 duckdb_settings = config.get("duckdb_settings") 270 duckdb_settings = full_path(duckdb_settings) 271 # duckdb setting is a file 272 if os.path.exists(duckdb_settings): 273 with open(duckdb_settings) as json_file: 274 duckdb_settings_dict = yaml.safe_load(json_file) 275 # duckdb settings is a string 276 else: 277 duckdb_settings_dict = json.loads(duckdb_settings) 278 279 return duckdb_settings_dict 280 281 def set_connexion_db(self) -> str: 282 """ 283 The function `set_connexion_db` returns the appropriate database connection string based on the 284 input format and connection type. 285 :return: the value of the variable `connexion_db`. 
286 """ 287 288 # Default connexion db 289 default_connexion_db = ":memory:" 290 291 # Find connexion db 292 if self.get_input_format() in ["db", "duckdb"]: 293 connexion_db = self.get_input() 294 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 295 connexion_db = default_connexion_db 296 elif self.get_connexion_type() in ["tmpfile"]: 297 tmp_name = tempfile.mkdtemp( 298 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 299 ) 300 connexion_db = f"{tmp_name}/tmp.db" 301 elif self.get_connexion_type() != "": 302 connexion_db = self.get_connexion_type() 303 else: 304 connexion_db = default_connexion_db 305 306 # Set connexion db 307 self.connexion_db = connexion_db 308 309 return connexion_db 310 311 def set_connexion(self, conn) -> None: 312 """ 313 The function `set_connexion` creates a connection to a database, with options for different 314 database formats and settings. 315 316 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 317 database. If a connection is not provided, a new connection to an in-memory database is created. 
318 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 319 sqlite 320 """ 321 322 # Connexion db 323 connexion_db = self.set_connexion_db() 324 325 # Connexion config 326 connexion_config = self.get_connexion_config() 327 328 # Connexion format 329 connexion_format = self.get_config().get("connexion_format", "duckdb") 330 # Set connexion format 331 self.connexion_format = connexion_format 332 333 # Connexion 334 if not conn: 335 if connexion_format in ["duckdb"]: 336 conn = duckdb.connect(connexion_db, config=connexion_config) 337 # duckDB settings 338 duckdb_settings = self.get_duckdb_settings() 339 if duckdb_settings: 340 for setting in duckdb_settings: 341 setting_value = duckdb_settings.get(setting) 342 if isinstance(setting_value, str): 343 setting_value = f"'{setting_value}'" 344 conn.execute(f"PRAGMA {setting}={setting_value};") 345 elif connexion_format in ["sqlite"]: 346 conn = sqlite3.connect(connexion_db) 347 348 # Set connexion 349 self.conn = conn 350 351 # Log 352 log.debug(f"connexion_format: {connexion_format}") 353 log.debug(f"connexion_db: {connexion_db}") 354 log.debug(f"connexion config: {connexion_config}") 355 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 356 357 def set_output(self, output: str = None) -> None: 358 """ 359 The `set_output` function in Python sets the output file based on the input or a specified key 360 in the config file, extracting the output name, extension, and format. 361 362 :param output: The `output` parameter in the `set_output` method is used to specify the name of 363 the output file. If the config file has an 'output' key, the method sets the output to the value 364 of that key. 
If no output is provided, it sets the output to `None` 365 :type output: str 366 """ 367 368 if output and not isinstance(output, str): 369 self.output = output.name 370 else: 371 self.output = output 372 373 # Output format 374 if self.output: 375 output_name, output_extension = os.path.splitext(self.output) 376 self.output_name = output_name 377 self.output_extension = output_extension 378 self.output_format = self.output_extension.replace(".", "") 379 else: 380 self.output_name = None 381 self.output_extension = None 382 self.output_format = None 383 384 def set_header(self) -> None: 385 """ 386 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 387 """ 388 389 input_file = self.get_input() 390 default_header_list = [ 391 "##fileformat=VCFv4.2", 392 "#CHROM POS ID REF ALT QUAL FILTER INFO", 393 ] 394 395 # Full path 396 input_file = full_path(input_file) 397 398 if input_file: 399 400 input_format = self.get_input_format() 401 input_compressed = self.get_input_compressed() 402 config = self.get_config() 403 header_list = default_header_list 404 if input_format in [ 405 "vcf", 406 "hdr", 407 "tsv", 408 "csv", 409 "psv", 410 "parquet", 411 "db", 412 "duckdb", 413 ]: 414 # header provided in param 415 if config.get("header_file", None): 416 with open(config.get("header_file"), "rt") as f: 417 header_list = self.read_vcf_header(f) 418 # within a vcf file format (header within input file itsself) 419 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 420 # within a compressed vcf file format (.vcf.gz) 421 if input_compressed: 422 with bgzf.open(input_file, "rt") as f: 423 header_list = self.read_vcf_header(f) 424 # within an uncompressed vcf file format (.vcf) 425 else: 426 with open(input_file, "rt") as f: 427 header_list = self.read_vcf_header(f) 428 # header provided in default external file .hdr 429 elif os.path.exists((input_file + ".hdr")): 430 with open(input_file + ".hdr", "rt") as f: 431 header_list = 
self.read_vcf_header(f) 432 else: 433 try: # Try to get header info fields and file columns 434 435 with tempfile.TemporaryDirectory() as tmpdir: 436 437 # Create database 438 db_for_header = Database(database=input_file) 439 440 # Get header columns for infos fields 441 db_header_from_columns = ( 442 db_for_header.get_header_from_columns() 443 ) 444 445 # Get real columns in the file 446 db_header_columns = db_for_header.get_columns() 447 448 # Write header file 449 header_file_tmp = os.path.join(tmpdir, "header") 450 f = open(header_file_tmp, "w") 451 vcf.Writer(f, db_header_from_columns) 452 f.close() 453 454 # Replace #CHROM line with rel columns 455 header_list = db_for_header.read_header_file( 456 header_file=header_file_tmp 457 ) 458 header_list[-1] = "\t".join(db_header_columns) 459 460 except: 461 462 log.warning( 463 f"No header for file {input_file}. Set as default VCF header" 464 ) 465 header_list = default_header_list 466 467 else: # try for unknown format ? 468 469 log.error(f"Input file format '{input_format}' not available") 470 raise ValueError(f"Input file format '{input_format}' not available") 471 472 if not header_list: 473 header_list = default_header_list 474 475 # header as list 476 self.header_list = header_list 477 478 # header as VCF object 479 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 480 481 else: 482 483 self.header_list = None 484 self.header_vcf = None 485 486 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 487 """ 488 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 489 DataFrame based on the connection format. 490 491 :param query: The `query` parameter in the `get_query_to_df` function is a string that 492 represents the SQL query you want to execute. 
This query will be used to fetch data from a 493 database and convert it into a pandas DataFrame 494 :type query: str 495 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 496 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 497 function will only fetch up to that number of rows from the database query result. If no limit 498 is specified, 499 :type limit: int 500 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 501 """ 502 503 # Connexion format 504 connexion_format = self.get_connexion_format() 505 506 # Limit in query 507 if limit: 508 pd.set_option("display.max_rows", limit) 509 if connexion_format in ["duckdb"]: 510 df = ( 511 self.conn.execute(query) 512 .fetch_record_batch(limit) 513 .read_next_batch() 514 .to_pandas() 515 ) 516 elif connexion_format in ["sqlite"]: 517 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 518 519 # Full query 520 else: 521 if connexion_format in ["duckdb"]: 522 df = self.conn.execute(query).df() 523 elif connexion_format in ["sqlite"]: 524 df = pd.read_sql_query(query, self.conn) 525 526 return df 527 528 def get_overview(self) -> None: 529 """ 530 The function prints the input, output, config, and dataframe of the current object 531 """ 532 table_variants_from = self.get_table_variants(clause="from") 533 sql_columns = self.get_header_columns_as_sql() 534 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 535 df = self.get_query_to_df(sql_query_export) 536 log.info( 537 "Input: " 538 + str(self.get_input()) 539 + " [" 540 + str(str(self.get_input_format())) 541 + "]" 542 ) 543 log.info( 544 "Output: " 545 + str(self.get_output()) 546 + " [" 547 + str(str(self.get_output_format())) 548 + "]" 549 ) 550 log.info("Config: ") 551 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 552 "\n" 553 ): 554 log.info("\t" + str(d)) 555 log.info("Param: ") 556 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 557 "\n" 558 ): 559 log.info("\t" + str(d)) 560 log.info("Sample list: " + str(self.get_header_sample_list())) 561 log.info("Dataframe: ") 562 for d in str(df).split("\n"): 563 log.info("\t" + str(d)) 564 565 # garbage collector 566 del df 567 gc.collect() 568 569 return None 570 571 def get_stats(self) -> dict: 572 """ 573 The `get_stats` function calculates and returns various statistics of the current object, 574 including information about the input file, variants, samples, header fields, quality, and 575 SNVs/InDels. 576 :return: a dictionary containing various statistics of the current object. The dictionary has 577 the following structure: 578 """ 579 580 # Log 581 log.info(f"Stats Calculation...") 582 583 # table varaints 584 table_variants_from = self.get_table_variants() 585 586 # stats dict 587 stats = {"Infos": {}} 588 589 ### File 590 input_file = self.get_input() 591 stats["Infos"]["Input file"] = input_file 592 593 # Header 594 header_infos = self.get_header().infos 595 header_formats = self.get_header().formats 596 header_infos_list = list(header_infos) 597 header_formats_list = list(header_formats) 598 599 ### Variants 600 601 stats["Variants"] = {} 602 603 # Variants by chr 604 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 605 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 606 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 607 by=["CHROM"], kind="quicksort" 608 ) 609 610 # Total number of variants 611 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 612 613 # Calculate percentage 614 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 615 lambda x: (x / nb_of_variants) 616 ) 617 618 stats["Variants"]["Number of variants by chromosome"] = ( 619 nb_of_variants_by_chrom.to_dict(orient="index") 620 ) 621 622 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 623 624 ### Samples 625 626 # Init 627 samples = {} 628 nb_of_samples = 0 629 630 # Check Samples 631 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 632 log.debug(f"Check samples...") 633 for sample in self.get_header_sample_list(): 634 sql_query_samples = f""" 635 SELECT '{sample}' as sample, 636 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 637 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 638 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 639 FROM {table_variants_from} 640 WHERE ( 641 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 642 AND 643 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 644 ) 645 GROUP BY genotype 646 """ 647 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 648 sample_genotype_count = sql_query_genotype_df["count"].sum() 649 if len(sql_query_genotype_df): 650 nb_of_samples += 1 651 samples[f"{sample} - {sample_genotype_count} variants"] = ( 652 sql_query_genotype_df.to_dict(orient="index") 653 ) 654 655 stats["Samples"] = samples 656 stats["Infos"]["Number of samples"] = nb_of_samples 657 658 # # 659 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 660 # stats["Infos"]["Number of samples"] = nb_of_samples 661 # elif nb_of_samples: 662 # stats["Infos"]["Number of samples"] = "not a VCF format" 663 664 ### INFO and FORMAT fields 665 header_types_df = {} 666 header_types_list = { 667 "List of INFO fields": header_infos, 668 "List of FORMAT fields": header_formats, 669 } 670 i = 0 671 for header_type in header_types_list: 672 673 header_type_infos = header_types_list.get(header_type) 674 header_infos_dict = {} 675 676 for info in header_type_infos: 677 678 i += 1 679 header_infos_dict[i] = {} 680 681 # ID 682 header_infos_dict[i]["id"] = info 683 684 # num 685 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 686 if header_type_infos[info].num in genotype_map.keys(): 687 header_infos_dict[i]["Number"] = genotype_map.get( 688 header_type_infos[info].num 689 ) 690 else: 691 header_infos_dict[i]["Number"] = header_type_infos[info].num 692 693 # type 694 if header_type_infos[info].type: 695 header_infos_dict[i]["Type"] = header_type_infos[info].type 696 else: 697 header_infos_dict[i]["Type"] = "." 698 699 # desc 700 if header_type_infos[info].desc != None: 701 header_infos_dict[i]["Description"] = header_type_infos[info].desc 702 else: 703 header_infos_dict[i]["Description"] = "" 704 705 if len(header_infos_dict): 706 header_types_df[header_type] = pd.DataFrame.from_dict( 707 header_infos_dict, orient="index" 708 ).to_dict(orient="index") 709 710 # Stats 711 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 712 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 713 stats["Header"] = header_types_df 714 715 ### QUAL 716 if "QUAL" in self.get_header_columns(): 717 sql_query_qual = f""" 718 SELECT 719 avg(CAST(QUAL AS INTEGER)) AS Average, 720 min(CAST(QUAL AS INTEGER)) AS Minimum, 721 max(CAST(QUAL AS INTEGER)) AS Maximum, 722 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 723 median(CAST(QUAL AS INTEGER)) AS Median, 724 variance(CAST(QUAL AS INTEGER)) AS Variance 725 FROM {table_variants_from} 726 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 727 """ 728 729 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 730 stats["Quality"] = {"Stats": qual} 731 732 ### SNV and InDel 733 734 sql_query_snv = f""" 735 736 SELECT Type, count FROM ( 737 738 SELECT 739 'Total' AS Type, 740 count(*) AS count 741 FROM {table_variants_from} 742 743 UNION 744 745 SELECT 746 'MNV' AS Type, 747 count(*) AS count 748 FROM {table_variants_from} 749 WHERE len(REF) > 1 AND len(ALT) > 1 750 AND len(REF) = len(ALT) 751 752 UNION 753 754 SELECT 755 'InDel' AS Type, 756 count(*) AS count 757 FROM 
{table_variants_from} 758 WHERE len(REF) > 1 OR len(ALT) > 1 759 AND len(REF) != len(ALT) 760 761 UNION 762 763 SELECT 764 'SNV' AS Type, 765 count(*) AS count 766 FROM {table_variants_from} 767 WHERE len(REF) = 1 AND len(ALT) = 1 768 769 ) 770 771 ORDER BY count DESC 772 773 """ 774 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 775 776 sql_query_snv_substitution = f""" 777 SELECT 778 concat(REF, '>', ALT) AS 'Substitution', 779 count(*) AS count 780 FROM {table_variants_from} 781 WHERE len(REF) = 1 AND len(ALT) = 1 782 GROUP BY REF, ALT 783 ORDER BY count(*) DESC 784 """ 785 snv_substitution = ( 786 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 787 ) 788 stats["Variants"]["Counts"] = snv_indel 789 stats["Variants"]["Substitutions"] = snv_substitution 790 791 return stats 792 793 def stats_to_file(self, file: str = None) -> str: 794 """ 795 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 796 into a JSON object, and writes the JSON object to the specified file. 797 798 :param file: The `file` parameter is a string that represents the file path where the JSON data 799 will be written 800 :type file: str 801 :return: the name of the file that was written to. 802 """ 803 804 # Get stats 805 stats = self.get_stats() 806 807 # Serializing json 808 json_object = json.dumps(stats, indent=4) 809 810 # Writing to sample.json 811 with open(file, "w") as outfile: 812 outfile.write(json_object) 813 814 return file 815 816 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 817 """ 818 The `print_stats` function generates a markdown file and prints the statistics contained in a 819 JSON file in a formatted manner. 820 821 :param output_file: The `output_file` parameter is a string that specifies the path and filename 822 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 823 provided, a temporary directory will be created and the stats will be saved in a file named 824 "stats.md" within that 825 :type output_file: str 826 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 827 file where the statistics will be saved. If no value is provided, a temporary directory will be 828 created and a default file name "stats.json" will be used 829 :type json_file: str 830 :return: The function `print_stats` does not return any value. It has a return type annotation 831 of `None`. 832 """ 833 834 # Full path 835 output_file = full_path(output_file) 836 json_file = full_path(json_file) 837 838 with tempfile.TemporaryDirectory() as tmpdir: 839 840 # Files 841 if not output_file: 842 output_file = os.path.join(tmpdir, "stats.md") 843 if not json_file: 844 json_file = os.path.join(tmpdir, "stats.json") 845 846 # Create folders 847 if not os.path.exists(os.path.dirname(output_file)): 848 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 849 if not os.path.exists(os.path.dirname(json_file)): 850 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 851 852 # Create stats JSON file 853 stats_file = self.stats_to_file(file=json_file) 854 855 # Print stats file 856 with open(stats_file) as f: 857 stats = yaml.safe_load(f) 858 859 # Output 860 output_title = [] 861 output_index = [] 862 output = [] 863 864 # Title 865 output_title.append("# HOWARD Stats") 866 867 # Index 868 output_index.append("## Index") 869 870 # Process sections 871 for section in stats: 872 infos = stats.get(section) 873 section_link = "#" + section.lower().replace(" ", "-") 874 output.append(f"## {section}") 875 output_index.append(f"- [{section}]({section_link})") 876 877 if len(infos): 878 for info in infos: 879 try: 880 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 881 is_df = True 882 except: 883 try: 884 df = pd.DataFrame.from_dict( 885 
json.loads((infos.get(info))), orient="index" 886 ) 887 is_df = True 888 except: 889 is_df = False 890 if is_df: 891 output.append(f"### {info}") 892 info_link = "#" + info.lower().replace(" ", "-") 893 output_index.append(f" - [{info}]({info_link})") 894 output.append(f"{df.to_markdown(index=False)}") 895 else: 896 output.append(f"- {info}: {infos.get(info)}") 897 else: 898 output.append(f"NA") 899 900 # Write stats in markdown file 901 with open(output_file, "w") as fp: 902 for item in output_title: 903 fp.write("%s\n" % item) 904 for item in output_index: 905 fp.write("%s\n" % item) 906 for item in output: 907 fp.write("%s\n" % item) 908 909 # Output stats in markdown 910 print("") 911 print("\n\n".join(output_title)) 912 print("") 913 print("\n\n".join(output)) 914 print("") 915 916 return None 917 918 def get_input(self) -> str: 919 """ 920 It returns the value of the input variable. 921 :return: The input is being returned. 922 """ 923 return self.input 924 925 def get_input_format(self, input_file: str = None) -> str: 926 """ 927 This function returns the format of the input variable, either from the provided input file or 928 by prompting for input. 929 930 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 931 represents the file path of the input file. If no `input_file` is provided when calling the 932 method, it will default to `None` 933 :type input_file: str 934 :return: The format of the input variable is being returned. 935 """ 936 937 if not input_file: 938 input_file = self.get_input() 939 input_format = get_file_format(input_file) 940 return input_format 941 942 def get_input_compressed(self, input_file: str = None) -> str: 943 """ 944 The function `get_input_compressed` returns the format of the input variable after compressing 945 it. 946 947 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 948 that represents the file path of the input file. 
If no `input_file` is provided when calling the 949 method, it will default to `None` and the method will then call `self.get_input()` to 950 :type input_file: str 951 :return: The function `get_input_compressed` returns the compressed format of the input 952 variable. 953 """ 954 955 if not input_file: 956 input_file = self.get_input() 957 input_compressed = get_file_compressed(input_file) 958 return input_compressed 959 960 def get_output(self) -> str: 961 """ 962 It returns the output of the neuron. 963 :return: The output of the neural network. 964 """ 965 966 return self.output 967 968 def get_output_format(self, output_file: str = None) -> str: 969 """ 970 The function `get_output_format` returns the format of the input variable or the output file if 971 provided. 972 973 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 974 that represents the file path of the output file. If no `output_file` is provided when calling 975 the method, it will default to the output obtained from the `get_output` method of the class 976 instance. The 977 :type output_file: str 978 :return: The format of the input variable is being returned. 979 """ 980 981 if not output_file: 982 output_file = self.get_output() 983 output_format = get_file_format(output_file) 984 985 return output_format 986 987 def get_config(self) -> dict: 988 """ 989 It returns the config 990 :return: The config variable is being returned. 991 """ 992 return self.config 993 994 def get_param(self) -> dict: 995 """ 996 It returns the param 997 :return: The param variable is being returned. 998 """ 999 return self.param 1000 1001 def get_connexion_db(self) -> str: 1002 """ 1003 It returns the connexion_db attribute of the object 1004 :return: The connexion_db is being returned. 1005 """ 1006 return self.connexion_db 1007 1008 def get_prefix(self) -> str: 1009 """ 1010 It returns the prefix of the object. 1011 :return: The prefix is being returned. 
1012 """ 1013 return self.prefix 1014 1015 def get_table_variants(self, clause: str = "select") -> str: 1016 """ 1017 This function returns the table_variants attribute of the object 1018 1019 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1020 defaults to select (optional) 1021 :return: The table_variants attribute of the object. 1022 """ 1023 1024 # Access 1025 access = self.get_config().get("access", None) 1026 1027 # Clauses "select", "where", "update" 1028 if clause in ["select", "where", "update"]: 1029 table_variants = self.table_variants 1030 # Clause "from" 1031 elif clause in ["from"]: 1032 # For Read Only 1033 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1034 input_file = self.get_input() 1035 table_variants = f"'{input_file}' as variants" 1036 # For Read Write 1037 else: 1038 table_variants = f"{self.table_variants} as variants" 1039 else: 1040 table_variants = self.table_variants 1041 return table_variants 1042 1043 def get_tmp_dir(self) -> str: 1044 """ 1045 The function `get_tmp_dir` returns the temporary directory path based on configuration 1046 parameters or a default path. 1047 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1048 configuration, parameters, and a default value of "/tmp". 1049 """ 1050 1051 return get_tmp( 1052 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1053 ) 1054 1055 def get_connexion_type(self) -> str: 1056 """ 1057 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1058 1059 :return: The connexion type is being returned. 1060 """ 1061 return self.get_config().get("connexion_type", "memory") 1062 1063 def get_connexion(self): 1064 """ 1065 It returns the connection object 1066 1067 :return: The connection object. 1068 """ 1069 return self.conn 1070 1071 def close_connexion(self) -> None: 1072 """ 1073 This function closes the connection to the database. 
1074 :return: The connection is being closed. 1075 """ 1076 return self.conn.close() 1077 1078 def get_header(self, type: str = "vcf"): 1079 """ 1080 This function returns the header of the VCF file as a list of strings 1081 1082 :param type: the type of header you want to get, defaults to vcf (optional) 1083 :return: The header of the vcf file. 1084 """ 1085 1086 if self.header_vcf: 1087 if type == "vcf": 1088 return self.header_vcf 1089 elif type == "list": 1090 return self.header_list 1091 else: 1092 if type == "vcf": 1093 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1094 return header 1095 elif type == "list": 1096 return vcf_required 1097 1098 def get_header_infos_list(self) -> list: 1099 """ 1100 This function retrieves a list of information fields from the header. 1101 :return: A list of information fields from the header. 1102 """ 1103 1104 # Init 1105 infos_list = [] 1106 1107 for field in self.get_header().infos: 1108 infos_list.append(field) 1109 1110 return infos_list 1111 1112 def get_header_length(self, file: str = None) -> int: 1113 """ 1114 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1115 line. 1116 1117 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1118 header file. If this argument is provided, the function will read the header from the specified 1119 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1120 :type file: str 1121 :return: the length of the header list, excluding the #CHROM line. 1122 """ 1123 1124 if file: 1125 return len(self.read_vcf_header_file(file=file)) - 1 1126 elif self.get_header(type="list"): 1127 return len(self.get_header(type="list")) - 1 1128 else: 1129 return 0 1130 1131 def get_header_columns(self) -> str: 1132 """ 1133 This function returns the header list of a VCF 1134 1135 :return: The length of the header list. 
1136 """ 1137 if self.get_header(): 1138 return self.get_header(type="list")[-1] 1139 else: 1140 return "" 1141 1142 def get_header_columns_as_list(self) -> list: 1143 """ 1144 This function returns the header list of a VCF 1145 1146 :return: The length of the header list. 1147 """ 1148 if self.get_header(): 1149 return self.get_header_columns().strip().split("\t") 1150 else: 1151 return [] 1152 1153 def get_header_columns_as_sql(self) -> str: 1154 """ 1155 This function retruns header length (without #CHROM line) 1156 1157 :return: The length of the header list. 1158 """ 1159 sql_column_list = [] 1160 for col in self.get_header_columns_as_list(): 1161 sql_column_list.append(f'"{col}"') 1162 return ",".join(sql_column_list) 1163 1164 def get_header_sample_list( 1165 self, check: bool = False, samples: list = None, samples_force: bool = False 1166 ) -> list: 1167 """ 1168 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1169 checking and filtering based on input parameters. 1170 1171 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1172 parameter that determines whether to check if the samples in the list are properly defined as 1173 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1174 list is defined as a, defaults to False 1175 :type check: bool (optional) 1176 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1177 allows you to specify a subset of samples from the header. If you provide a list of sample 1178 names, the function will check if each sample is defined in the header. 
If a sample is not found 1179 in the 1180 :type samples: list 1181 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1182 a boolean parameter that determines whether to force the function to return the sample list 1183 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1184 function will return the sample list without performing, defaults to False 1185 :type samples_force: bool (optional) 1186 :return: The function `get_header_sample_list` returns a list of samples based on the input 1187 parameters and conditions specified in the function. 1188 """ 1189 1190 # Init 1191 samples_list = [] 1192 1193 if samples is None: 1194 samples_list = self.header_vcf.samples 1195 else: 1196 samples_checked = [] 1197 for sample in samples: 1198 if sample in self.header_vcf.samples: 1199 samples_checked.append(sample) 1200 else: 1201 log.warning(f"Sample '{sample}' not defined in header") 1202 samples_list = samples_checked 1203 1204 # Force sample list without checking if is_genotype_column 1205 if samples_force: 1206 log.warning(f"Samples {samples_list} not checked if genotypes") 1207 return samples_list 1208 1209 if check: 1210 samples_checked = [] 1211 for sample in samples_list: 1212 if self.is_genotype_column(column=sample): 1213 samples_checked.append(sample) 1214 else: 1215 log.warning( 1216 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1217 ) 1218 samples_list = samples_checked 1219 1220 # Return samples list 1221 return samples_list 1222 1223 def is_genotype_column(self, column: str = None) -> bool: 1224 """ 1225 This function checks if a given column is a genotype column in a database. 1226 1227 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1228 represents the column name in a database table. This method checks if the specified column is a 1229 genotype column in the database. 
If a column name is provided, it calls the `is_genotype_column` 1230 method of 1231 :type column: str 1232 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1233 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1234 column name and returns the result. If the `column` parameter is None, it returns False. 1235 """ 1236 1237 if column is not None: 1238 return Database(database=self.get_input()).is_genotype_column(column=column) 1239 else: 1240 return False 1241 1242 def get_verbose(self) -> bool: 1243 """ 1244 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1245 exist 1246 1247 :return: The value of the key "verbose" in the config dictionary. 1248 """ 1249 return self.get_config().get("verbose", False) 1250 1251 def get_connexion_format(self) -> str: 1252 """ 1253 It returns the connexion format of the object. 1254 :return: The connexion_format is being returned. 1255 """ 1256 connexion_format = self.connexion_format 1257 if connexion_format not in ["duckdb", "sqlite"]: 1258 log.error(f"Unknown connexion format {connexion_format}") 1259 raise ValueError(f"Unknown connexion format {connexion_format}") 1260 else: 1261 return connexion_format 1262 1263 def insert_file_to_table( 1264 self, 1265 file, 1266 columns: str, 1267 header_len: int = 0, 1268 sep: str = "\t", 1269 chunksize: int = 1000000, 1270 ) -> None: 1271 """ 1272 The function reads a file in chunks and inserts each chunk into a table based on the specified 1273 database format. 1274 1275 :param file: The `file` parameter is the file that you want to load into a table. It should be 1276 the path to the file on your system 1277 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1278 should contain the names of the columns in the table where the data will be inserted. 
The column 1279 names should be separated by commas within the string. For example, if you have columns named 1280 "id", "name 1281 :type columns: str 1282 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1283 the number of lines to skip at the beginning of the file before reading the actual data. This 1284 parameter allows you to skip any header information present in the file before processing the 1285 data, defaults to 0 1286 :type header_len: int (optional) 1287 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1288 separator character that is used in the file being read. In this case, the default separator is 1289 set to `\t`, which represents a tab character. You can change this parameter to a different 1290 separator character if, defaults to \t 1291 :type sep: str (optional) 1292 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1293 when processing the file in chunks. In the provided code snippet, the default value for 1294 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1295 to 1000000 1296 :type chunksize: int (optional) 1297 """ 1298 1299 # Config 1300 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1301 connexion_format = self.get_connexion_format() 1302 1303 log.debug("chunksize: " + str(chunksize)) 1304 1305 if chunksize: 1306 for chunk in pd.read_csv( 1307 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1308 ): 1309 if connexion_format in ["duckdb"]: 1310 sql_insert_into = ( 1311 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1312 ) 1313 self.conn.execute(sql_insert_into) 1314 elif connexion_format in ["sqlite"]: 1315 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1316 1317 def load_data( 1318 self, 1319 input_file: str = None, 1320 drop_variants_table: bool = False, 1321 sample_size: int = 20480, 1322 ) -> None: 1323 """ 1324 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1325 table before loading the data and specify a sample size. 1326 1327 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1328 table 1329 :type input_file: str 1330 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1331 determines whether the variants table should be dropped before loading the data. If set to 1332 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1333 not be dropped, defaults to False 1334 :type drop_variants_table: bool (optional) 1335 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1336 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to 1337 20480 1338 :type sample_size: int (optional) 1339 """ 1340 1341 log.info("Loading...") 1342 1343 # change input file 1344 if input_file: 1345 self.set_input(input_file) 1346 self.set_header() 1347 1348 # drop variants table 1349 if drop_variants_table: 1350 self.drop_variants_table() 1351 1352 # get table variants 1353 table_variants = self.get_table_variants() 1354 1355 # Access 1356 access = self.get_config().get("access", None) 1357 log.debug(f"access: {access}") 1358 1359 # Input format and compress 1360 input_format = self.get_input_format() 1361 input_compressed = self.get_input_compressed() 1362 log.debug(f"input_format: {input_format}") 1363 log.debug(f"input_compressed: {input_compressed}") 1364 1365 # input_compressed_format 1366 if input_compressed: 1367 input_compressed_format = "gzip" 1368 else: 1369 input_compressed_format = "none" 1370 log.debug(f"input_compressed_format: {input_compressed_format}") 1371 1372 # Connexion format 1373 connexion_format = self.get_connexion_format() 1374 1375 # Sample size 1376 if not sample_size: 1377 sample_size = -1 1378 log.debug(f"sample_size: {sample_size}") 1379 1380 # Load data 1381 log.debug(f"Load Data from {input_format}") 1382 1383 # DuckDB connexion 1384 if connexion_format in ["duckdb"]: 1385 1386 # Database already exists 1387 if self.input_format in ["db", "duckdb"]: 1388 1389 if connexion_format in ["duckdb"]: 1390 log.debug(f"Input file format '{self.input_format}' duckDB") 1391 else: 1392 log.error( 1393 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1394 ) 1395 raise ValueError( 1396 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1397 ) 1398 1399 # Load from existing database format 1400 else: 1401 1402 try: 1403 # Create Table or View 1404 database = Database(database=self.input) 1405 sql_from = 
database.get_sql_from(sample_size=sample_size) 1406 1407 if access in ["RO"]: 1408 sql_load = ( 1409 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1410 ) 1411 else: 1412 sql_load = ( 1413 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1414 ) 1415 self.conn.execute(sql_load) 1416 1417 except: 1418 # Format not available 1419 log.error(f"Input file format '{self.input_format}' not available") 1420 raise ValueError( 1421 f"Input file format '{self.input_format}' not available" 1422 ) 1423 1424 # SQLite connexion 1425 elif connexion_format in ["sqlite"] and input_format in [ 1426 "vcf", 1427 "tsv", 1428 "csv", 1429 "psv", 1430 ]: 1431 1432 # Main structure 1433 structure = { 1434 "#CHROM": "VARCHAR", 1435 "POS": "INTEGER", 1436 "ID": "VARCHAR", 1437 "REF": "VARCHAR", 1438 "ALT": "VARCHAR", 1439 "QUAL": "VARCHAR", 1440 "FILTER": "VARCHAR", 1441 "INFO": "VARCHAR", 1442 } 1443 1444 # Strcuture with samples 1445 structure_complete = structure 1446 if self.get_header_sample_list(): 1447 structure["FORMAT"] = "VARCHAR" 1448 for sample in self.get_header_sample_list(): 1449 structure_complete[sample] = "VARCHAR" 1450 1451 # Columns list for create and insert 1452 sql_create_table_columns = [] 1453 sql_create_table_columns_list = [] 1454 for column in structure_complete: 1455 column_type = structure_complete[column] 1456 sql_create_table_columns.append( 1457 f'"{column}" {column_type} default NULL' 1458 ) 1459 sql_create_table_columns_list.append(f'"{column}"') 1460 1461 # Create database 1462 log.debug(f"Create Table {table_variants}") 1463 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1464 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1465 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1466 self.conn.execute(sql_create_table) 1467 1468 # chunksize define length of file chunk load file 1469 chunksize = 100000 1470 1471 # delimiter 1472 delimiter 
= file_format_delimiters.get(input_format, "\t") 1473 1474 # Load the input file 1475 with open(self.input, "rt") as input_file: 1476 1477 # Use the appropriate file handler based on the input format 1478 if input_compressed: 1479 input_file = bgzf.open(self.input, "rt") 1480 if input_format in ["vcf"]: 1481 header_len = self.get_header_length() 1482 else: 1483 header_len = 0 1484 1485 # Insert the file contents into a table 1486 self.insert_file_to_table( 1487 input_file, 1488 columns=sql_create_table_columns_list_sql, 1489 header_len=header_len, 1490 sep=delimiter, 1491 chunksize=chunksize, 1492 ) 1493 1494 else: 1495 log.error( 1496 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1497 ) 1498 raise ValueError( 1499 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1500 ) 1501 1502 # Explode INFOS fields into table fields 1503 if self.get_explode_infos(): 1504 self.explode_infos( 1505 prefix=self.get_explode_infos_prefix(), 1506 fields=self.get_explode_infos_fields(), 1507 force=True, 1508 ) 1509 1510 # Create index after insertion 1511 self.create_indexes() 1512 1513 def get_explode_infos(self) -> bool: 1514 """ 1515 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1516 to False if it is not set. 1517 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1518 value. If the parameter is not present, it will return False. 1519 """ 1520 1521 return self.get_param().get("explode", {}).get("explode_infos", False) 1522 1523 def get_explode_infos_fields( 1524 self, 1525 explode_infos_fields: str = None, 1526 remove_fields_not_in_header: bool = False, 1527 ) -> list: 1528 """ 1529 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1530 the input parameter `explode_infos_fields`. 
1531 1532 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1533 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1534 comma-separated list of field names to explode 1535 :type explode_infos_fields: str 1536 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1537 flag that determines whether to remove fields that are not present in the header. If it is set 1538 to `True`, any field that is not in the header will be excluded from the list of exploded 1539 information fields. If it is set to `, defaults to False 1540 :type remove_fields_not_in_header: bool (optional) 1541 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1542 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1543 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1544 Otherwise, it returns a list of exploded information fields after removing any spaces and 1545 splitting the string by commas. 
1546 """ 1547 1548 # If no fields, get it in param 1549 if not explode_infos_fields: 1550 explode_infos_fields = ( 1551 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1552 ) 1553 1554 # If no fields, defined as all fields in header using keyword 1555 if not explode_infos_fields: 1556 explode_infos_fields = "*" 1557 1558 # If fields list not empty 1559 if explode_infos_fields: 1560 1561 # Input fields list 1562 if isinstance(explode_infos_fields, str): 1563 fields_input = explode_infos_fields.split(",") 1564 elif isinstance(explode_infos_fields, list): 1565 fields_input = explode_infos_fields 1566 else: 1567 fields_input = [] 1568 1569 # Fields list without * keyword 1570 fields_without_all = fields_input.copy() 1571 if "*".casefold() in (item.casefold() for item in fields_without_all): 1572 fields_without_all.remove("*") 1573 1574 # Fields in header 1575 fields_in_header = sorted(list(set(self.get_header().infos))) 1576 1577 # Construct list of fields 1578 fields_output = [] 1579 for field in fields_input: 1580 1581 # Strip field 1582 field = field.strip() 1583 1584 # format keyword * in regex 1585 if field.upper() in ["*"]: 1586 field = ".*" 1587 1588 # Find all fields with pattern 1589 r = re.compile(field) 1590 fields_search = sorted(list(filter(r.match, fields_in_header))) 1591 1592 # Remove fields input from search 1593 if field in fields_search: 1594 fields_search = [field] 1595 elif fields_search != [field]: 1596 fields_search = sorted( 1597 list(set(fields_search).difference(fields_input)) 1598 ) 1599 1600 # If field is not in header (avoid not well formatted header) 1601 if not fields_search and not remove_fields_not_in_header: 1602 fields_search = [field] 1603 1604 # Add found fields 1605 for new_field in fields_search: 1606 # Add field, if not already exists, and if it is in header (if asked) 1607 if ( 1608 new_field not in fields_output 1609 and ( 1610 not remove_fields_not_in_header 1611 or new_field in fields_in_header 1612 ) 
1613 and new_field not in [".*"] 1614 ): 1615 fields_output.append(new_field) 1616 1617 return fields_output 1618 1619 else: 1620 1621 return [] 1622 1623 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1624 """ 1625 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1626 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1627 not provided. 1628 1629 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1630 prefix to be used for exploding or expanding information 1631 :type explode_infos_prefix: str 1632 :return: the value of the variable `explode_infos_prefix`. 1633 """ 1634 1635 if not explode_infos_prefix: 1636 explode_infos_prefix = ( 1637 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1638 ) 1639 1640 return explode_infos_prefix 1641 1642 def add_column( 1643 self, 1644 table_name, 1645 column_name, 1646 column_type, 1647 default_value=None, 1648 drop: bool = False, 1649 ) -> dict: 1650 """ 1651 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1652 doesn't already exist. 1653 1654 :param table_name: The name of the table to which you want to add a column 1655 :param column_name: The parameter "column_name" is the name of the column that you want to add 1656 to the table 1657 :param column_type: The `column_type` parameter specifies the data type of the column that you 1658 want to add to the table. It should be a string that represents the desired data type, such as 1659 "INTEGER", "TEXT", "REAL", etc 1660 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1661 default value for the newly added column. 
If a default value is provided, it will be assigned to 1662 the column for any existing rows that do not have a value for that column 1663 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1664 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1665 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1666 to False 1667 :type drop: bool (optional) 1668 :return: a boolean value indicating whether the column was successfully added to the table. 1669 """ 1670 1671 # added 1672 added = False 1673 dropped = False 1674 1675 # Check if the column already exists in the table 1676 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1677 columns = self.get_query_to_df(query).columns.tolist() 1678 if column_name.upper() in [c.upper() for c in columns]: 1679 log.debug( 1680 f"The {column_name} column already exists in the {table_name} table" 1681 ) 1682 if drop: 1683 self.drop_column(table_name=table_name, column_name=column_name) 1684 dropped = True 1685 else: 1686 return None 1687 else: 1688 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1689 1690 # Add column in table 1691 add_column_query = ( 1692 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1693 ) 1694 if default_value is not None: 1695 add_column_query += f" DEFAULT {default_value}" 1696 self.execute_query(add_column_query) 1697 added = not dropped 1698 log.debug( 1699 f"The {column_name} column was successfully added to the {table_name} table" 1700 ) 1701 1702 if added: 1703 added_column = { 1704 "table_name": table_name, 1705 "column_name": column_name, 1706 "column_type": column_type, 1707 "default_value": default_value, 1708 } 1709 else: 1710 added_column = None 1711 1712 return added_column 1713 1714 def drop_column( 1715 self, column: dict = None, table_name: str = None, column_name: str = None 1716 ) -> bool: 1717 """ 1718 The 
`drop_column` function drops a specified column from a given table in a database and returns 1719 True if the column was successfully dropped, and False if the column does not exist in the 1720 table. 1721 1722 :param column: The `column` parameter is a dictionary that contains information about the column 1723 you want to drop. It has two keys: 1724 :type column: dict 1725 :param table_name: The `table_name` parameter is the name of the table from which you want to 1726 drop a column 1727 :type table_name: str 1728 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1729 from the table 1730 :type column_name: str 1731 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1732 and False if the column does not exist in the table. 1733 """ 1734 1735 # Find column infos 1736 if column: 1737 if isinstance(column, dict): 1738 table_name = column.get("table_name", None) 1739 column_name = column.get("column_name", None) 1740 elif isinstance(column, str): 1741 table_name = self.get_table_variants() 1742 column_name = column 1743 else: 1744 table_name = None 1745 column_name = None 1746 1747 if not table_name and not column_name: 1748 return False 1749 1750 # Removed 1751 removed = False 1752 1753 # Check if the column already exists in the table 1754 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1755 columns = self.get_query_to_df(query).columns.tolist() 1756 if column_name in columns: 1757 log.debug(f"The {column_name} column exists in the {table_name} table") 1758 else: 1759 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1760 return False 1761 1762 # Add column in table # ALTER TABLE integers DROP k 1763 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1764 self.execute_query(add_column_query) 1765 removed = True 1766 log.debug( 1767 f"The {column_name} column was successfully dropped to the {table_name} table" 1768 ) 1769 
1770 return removed 1771 1772 def explode_infos( 1773 self, 1774 prefix: str = None, 1775 create_index: bool = False, 1776 fields: list = None, 1777 force: bool = False, 1778 proccess_all_fields_together: bool = False, 1779 table: str = None, 1780 ) -> list: 1781 """ 1782 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1783 individual columns, returning a list of added columns. 1784 1785 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1786 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1787 `self.get_explode_infos_prefix()` as the prefix 1788 :type prefix: str 1789 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1790 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1791 `False`, indexes will not be created. The default value is `False`, defaults to False 1792 :type create_index: bool (optional) 1793 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1794 that you want to explode into individual columns. If this parameter is not provided, all INFO 1795 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1796 a list to the ` 1797 :type fields: list 1798 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1799 determines whether to drop and recreate a column if it already exists in the table. If `force` 1800 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1801 defaults to False 1802 :type force: bool (optional) 1803 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1804 flag that determines whether to process all the INFO fields together or individually. If set to 1805 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1806 be processed individually. The default value is, defaults to False 1807 :type proccess_all_fields_together: bool (optional) 1808 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1809 of the table where the exploded INFO fields will be added as individual columns. If you provide 1810 a value for the `table` parameter, the function will use that table name. If the `table` 1811 parameter is 1812 :type table: str 1813 :return: The `explode_infos` function returns a list of added columns. 1814 """ 1815 1816 # drop indexes 1817 self.drop_indexes() 1818 1819 # connexion format 1820 connexion_format = self.get_connexion_format() 1821 1822 # Access 1823 access = self.get_config().get("access", None) 1824 1825 # Added columns 1826 added_columns = [] 1827 1828 if access not in ["RO"]: 1829 1830 # prefix 1831 if prefix in [None, True] or not isinstance(prefix, str): 1832 if self.get_explode_infos_prefix() not in [None, True]: 1833 prefix = self.get_explode_infos_prefix() 1834 else: 1835 prefix = "INFO/" 1836 1837 # table variants 1838 if table is not None: 1839 table_variants = table 1840 else: 1841 table_variants = self.get_table_variants(clause="select") 1842 1843 # extra infos 1844 try: 1845 extra_infos = self.get_extra_infos() 1846 except: 1847 extra_infos = [] 1848 1849 # Header infos 1850 header_infos = self.get_header().infos 1851 1852 log.debug( 1853 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1854 ) 1855 1856 sql_info_alter_table_array = [] 1857 1858 # Info fields to check 1859 fields_list = list(header_infos) 1860 if fields: 1861 fields_list += fields 1862 fields_list = set(fields_list) 1863 1864 # If no fields 1865 if not fields: 1866 fields = [] 1867 1868 # Translate fields if patterns 1869 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1870 1871 for info in fields: 1872 1873 info_id_sql = prefix + info 1874 1875 if ( 1876 info 
in fields_list 1877 or prefix + info in fields_list 1878 or info in extra_infos 1879 ): 1880 1881 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1882 1883 if info in header_infos: 1884 info_type = header_infos[info].type 1885 info_num = header_infos[info].num 1886 else: 1887 info_type = "String" 1888 info_num = 0 1889 1890 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1891 if info_num != 1: 1892 type_sql = "VARCHAR" 1893 1894 # Add field 1895 added_column = self.add_column( 1896 table_name=table_variants, 1897 column_name=info_id_sql, 1898 column_type=type_sql, 1899 default_value="null", 1900 drop=force, 1901 ) 1902 1903 if added_column: 1904 added_columns.append(added_column) 1905 1906 if added_column or force: 1907 1908 # add field to index 1909 self.index_additionnal_fields.append(info_id_sql) 1910 1911 # Update field array 1912 if connexion_format in ["duckdb"]: 1913 update_info_field = f""" 1914 "{info_id_sql}" = 1915 CASE 1916 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1917 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1918 END 1919 """ 1920 elif connexion_format in ["sqlite"]: 1921 update_info_field = f""" 1922 "{info_id_sql}" = 1923 CASE 1924 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1925 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1926 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1927 END 1928 """ 1929 1930 sql_info_alter_table_array.append(update_info_field) 1931 1932 if sql_info_alter_table_array: 1933 1934 # By chromosomes 1935 try: 1936 chromosomes_list = list( 1937 self.get_query_to_df( 1938 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1939 )["#CHROM"] 1940 ) 1941 except: 1942 chromosomes_list = [None] 1943 1944 for chrom in chromosomes_list: 1945 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1946 1947 # Where clause 1948 where_clause = "" 1949 if chrom and len(chromosomes_list) > 1: 1950 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1951 1952 # Update table 1953 if proccess_all_fields_together: 1954 sql_info_alter_table_array_join = ", ".join( 1955 sql_info_alter_table_array 1956 ) 1957 if sql_info_alter_table_array_join: 1958 sql_info_alter_table = f""" 1959 UPDATE {table_variants} 1960 SET {sql_info_alter_table_array_join} 1961 {where_clause} 1962 """ 1963 log.debug( 1964 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1965 ) 1966 # log.debug(sql_info_alter_table) 1967 self.conn.execute(sql_info_alter_table) 1968 else: 1969 sql_info_alter_num = 0 1970 for sql_info_alter in sql_info_alter_table_array: 1971 sql_info_alter_num += 1 1972 sql_info_alter_table = f""" 1973 UPDATE {table_variants} 1974 SET {sql_info_alter} 1975 {where_clause} 1976 """ 1977 log.debug( 1978 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
1979 ) 1980 # log.debug(sql_info_alter_table) 1981 self.conn.execute(sql_info_alter_table) 1982 1983 # create indexes 1984 if create_index: 1985 self.create_indexes() 1986 1987 return added_columns 1988 1989 def create_indexes(self) -> None: 1990 """ 1991 Create indexes on the table after insertion 1992 """ 1993 1994 # Access 1995 access = self.get_config().get("access", None) 1996 1997 # get table variants 1998 table_variants = self.get_table_variants("FROM") 1999 2000 if self.get_indexing() and access not in ["RO"]: 2001 # Create index 2002 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2003 self.conn.execute(sql_create_table_index) 2004 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2005 self.conn.execute(sql_create_table_index) 2006 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2007 self.conn.execute(sql_create_table_index) 2008 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2009 self.conn.execute(sql_create_table_index) 2010 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2011 self.conn.execute(sql_create_table_index) 2012 for field in self.index_additionnal_fields: 2013 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2014 self.conn.execute(sql_create_table_index) 2015 2016 def drop_indexes(self) -> None: 2017 """ 2018 Create indexes on the table after insertion 2019 """ 2020 2021 # Access 2022 access = self.get_config().get("access", None) 2023 2024 # get table variants 2025 table_variants = self.get_table_variants("FROM") 2026 2027 # Get database format 2028 connexion_format = 
self.get_connexion_format() 2029 2030 if access not in ["RO"]: 2031 if connexion_format in ["duckdb"]: 2032 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2033 elif connexion_format in ["sqlite"]: 2034 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2035 2036 list_indexes = self.conn.execute(sql_list_indexes) 2037 index_names = [row[0] for row in list_indexes.fetchall()] 2038 for index in index_names: 2039 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2040 self.conn.execute(sql_drop_table_index) 2041 2042 def read_vcf_header(self, f) -> list: 2043 """ 2044 It reads the header of a VCF file and returns a list of the header lines 2045 2046 :param f: the file object 2047 :return: The header lines of the VCF file. 2048 """ 2049 2050 header_list = [] 2051 for line in f: 2052 header_list.append(line) 2053 if line.startswith("#CHROM"): 2054 break 2055 return header_list 2056 2057 def read_vcf_header_file(self, file: str = None) -> list: 2058 """ 2059 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2060 uncompressed files. 2061 2062 :param file: The `file` parameter is a string that represents the path to the VCF header file 2063 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2064 default to `None` 2065 :type file: str 2066 :return: The function `read_vcf_header_file` returns a list. 2067 """ 2068 2069 if self.get_input_compressed(input_file=file): 2070 with bgzf.open(file, "rt") as f: 2071 return self.read_vcf_header(f=f) 2072 else: 2073 with open(file, "rt") as f: 2074 return self.read_vcf_header(f=f) 2075 2076 def execute_query(self, query: str): 2077 """ 2078 It takes a query as an argument, executes it, and returns the results 2079 2080 :param query: The query to be executed 2081 :return: The result of the query is being returned. 
2082 """ 2083 if query: 2084 return self.conn.execute(query) # .fetchall() 2085 else: 2086 return None 2087 2088 def export_output( 2089 self, 2090 output_file: str | None = None, 2091 output_header: str | None = None, 2092 export_header: bool = True, 2093 query: str | None = None, 2094 parquet_partitions: list | None = None, 2095 chunk_size: int | None = None, 2096 threads: int | None = None, 2097 sort: bool = False, 2098 index: bool = False, 2099 order_by: str | None = None, 2100 fields_to_rename: dict | None = None 2101 ) -> bool: 2102 """ 2103 The `export_output` function exports data from a VCF file to various formats, including VCF, 2104 CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and 2105 partitioning. 2106 2107 :param output_file: The `output_file` parameter is a string that specifies the name of the 2108 output file where the exported data will be saved 2109 :type output_file: str | None 2110 :param output_header: The `output_header` parameter is a string that specifies the name of the 2111 file where the header of the VCF file will be exported. If this parameter is not provided, the 2112 header will be exported to a file with the same name as the `output_file` parameter, but with 2113 the extension " 2114 :type output_header: str | None 2115 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2116 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2117 True, the header will be exported to a file. If `export_header` is False, the header will not 2118 be, defaults to True 2119 :type export_header: bool (optional) 2120 :param query: The `query` parameter in the `export_output` function is an optional SQL query 2121 that can be used to filter and select specific data from the VCF file before exporting it. If 2122 provided, only the data that matches the query will be exported. 
This allows you to customize 2123 the exported data based on 2124 :type query: str | None 2125 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2126 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2127 organize data in a hierarchical directory structure based on the values of one or more columns. 2128 This can improve query performance when working with large datasets 2129 :type parquet_partitions: list | None 2130 :param chunk_size: The `chunk_size` parameter specifies the number of records in a batch when 2131 exporting data in Parquet format. This parameter is used for partitioning the Parquet file into 2132 multiple files. It helps in optimizing the export process by breaking down the data into 2133 manageable chunks for processing and storage 2134 :type chunk_size: int | None 2135 :param threads: The `threads` parameter in the `export_output` function specifies the number of 2136 threads to be used during the export process. It determines the level of parallelism and can 2137 improve the performance of the export operation. If this parameter is not provided, the function 2138 will use the default number of threads 2139 :type threads: int | None 2140 :param sort: The `sort` parameter in the `export_output` function is a boolean flag that 2141 determines whether the output file should be sorted based on genomic coordinates of the 2142 variants. If `sort` is set to `True`, the output file will be sorted. If `sort` is set to 2143 `False`,, defaults to False 2144 :type sort: bool (optional) 2145 :param index: The `index` parameter in the `export_output` function is a boolean flag that 2146 determines whether an index should be created on the output file. If `index` is set to `True`, 2147 an index will be created on the output file. 
If `index` is set to `False`, no, defaults to False 2148 :type index: bool (optional) 2149 :param order_by: The `order_by` parameter in the `export_output` function is a string that 2150 specifies the column(s) to use for sorting the output file. This parameter is only applicable 2151 when exporting data in VCF format. It allows you to specify the column(s) based on which the 2152 output file should be 2153 :type order_by: str | None 2154 :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that specifies the 2155 mapping of field names to be renamed during the export process. This parameter allows you to 2156 customize the output field names before exporting the data. Each key-value pair in the 2157 dictionary represents the original field name as the key and the new field name 2158 :type fields_to_rename: dict | None 2159 :return: The `export_output` function returns a boolean value. It checks if the output file 2160 exists and returns True if it does, or None if it doesn't. 
2161 """ 2162 2163 # Log 2164 log.info("Exporting...") 2165 2166 # Full path 2167 output_file = full_path(output_file) 2168 output_header = full_path(output_header) 2169 2170 # Config 2171 config = self.get_config() 2172 2173 # Param 2174 param = self.get_param() 2175 2176 # Tmp files to remove 2177 tmp_to_remove = [] 2178 2179 # If no output, get it 2180 if not output_file: 2181 output_file = self.get_output() 2182 2183 # If not threads 2184 if not threads: 2185 threads = self.get_threads() 2186 2187 # Rename fields 2188 if not fields_to_rename: 2189 fields_to_rename = param.get("export", {}).get("fields_to_rename", None) 2190 self.rename_info_fields(fields_to_rename=fields_to_rename) 2191 2192 # Auto header name with extension 2193 if export_header or output_header: 2194 if not output_header: 2195 output_header = f"{output_file}.hdr" 2196 # Export header 2197 self.export_header(output_file=output_file) 2198 2199 # Switch off export header if VCF output 2200 output_file_type = get_file_format(output_file) 2201 if output_file_type in ["vcf"]: 2202 export_header = False 2203 tmp_to_remove.append(output_header) 2204 2205 # Chunk size 2206 if not chunk_size: 2207 chunk_size = config.get("chunk_size", None) 2208 2209 # Parquet partition 2210 if not parquet_partitions: 2211 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2212 if parquet_partitions and isinstance(parquet_partitions, str): 2213 parquet_partitions = parquet_partitions.split(",") 2214 2215 # Order by 2216 if not order_by: 2217 order_by = param.get("export", {}).get("order_by", "") 2218 2219 # Header in output 2220 header_in_output = param.get("export", {}).get("include_header", False) 2221 2222 # Database 2223 database_source = self.get_connexion() 2224 2225 # Connexion format 2226 connexion_format = self.get_connexion_format() 2227 2228 # Explode infos 2229 if self.get_explode_infos(): 2230 self.explode_infos( 2231 prefix=self.get_explode_infos_prefix(), 2232 
fields=self.get_explode_infos_fields(), 2233 force=False, 2234 ) 2235 2236 # if connexion_format in ["sqlite"] or query: 2237 if connexion_format in ["sqlite"]: 2238 2239 # Export in Parquet 2240 random_tmp = "".join( 2241 random.choice(string.ascii_lowercase) for i in range(10) 2242 ) 2243 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2244 tmp_to_remove.append(database_source) 2245 2246 # Table Variants 2247 table_variants = self.get_table_variants() 2248 2249 # Create export query 2250 sql_query_export_subquery = f""" 2251 SELECT * FROM {table_variants} 2252 """ 2253 2254 # Write source file 2255 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2256 2257 # Create database 2258 database = Database( 2259 database=database_source, 2260 table="variants", 2261 header_file=output_header, 2262 conn_config=self.get_connexion_config(), 2263 ) 2264 2265 # Existing colomns header 2266 existing_columns_header = database.get_header_columns_from_database(query=query) 2267 2268 # Sample list 2269 if output_file_type in ["vcf"]: 2270 get_samples = self.get_samples() 2271 get_samples_check = self.get_samples_check() 2272 samples_force = get_samples is not None 2273 sample_list = self.get_header_sample_list( 2274 check=get_samples_check, 2275 samples=get_samples, 2276 samples_force=samples_force, 2277 ) 2278 else: 2279 sample_list = None 2280 2281 # Export file 2282 database.export( 2283 output_database=output_file, 2284 output_header=output_header, 2285 existing_columns_header=existing_columns_header, 2286 parquet_partitions=parquet_partitions, 2287 chunk_size=chunk_size, 2288 threads=threads, 2289 sort=sort, 2290 index=index, 2291 header_in_output=header_in_output, 2292 order_by=order_by, 2293 query=query, 2294 export_header=export_header, 2295 sample_list=sample_list, 2296 ) 2297 2298 # Remove 2299 remove_if_exists(tmp_to_remove) 2300 2301 return (os.path.exists(output_file) or None) and ( 2302 os.path.exists(output_file) 
or None 2303 ) 2304 2305 def get_extra_infos(self, table: str = None) -> list: 2306 """ 2307 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2308 in the header. 2309 2310 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2311 name of the table from which you want to retrieve the extra columns that are not present in the 2312 header. If the `table` parameter is not provided when calling the function, it will default to 2313 using the variants 2314 :type table: str 2315 :return: A list of columns that are in the specified table but not in the header of the table. 2316 """ 2317 2318 header_columns = [] 2319 2320 if not table: 2321 table = self.get_table_variants(clause="from") 2322 header_columns = self.get_header_columns() 2323 2324 # Check all columns in the database 2325 query = f""" SELECT * FROM {table} LIMIT 1 """ 2326 log.debug(f"query {query}") 2327 table_columns = self.get_query_to_df(query).columns.tolist() 2328 extra_columns = [] 2329 2330 # Construct extra infos (not in header) 2331 for column in table_columns: 2332 if column not in header_columns: 2333 extra_columns.append(column) 2334 2335 return extra_columns 2336 2337 def get_extra_infos_sql(self, table: str = None) -> str: 2338 """ 2339 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2340 by double quotes 2341 2342 :param table: The name of the table to get the extra infos from. 
If None, the default table is 2343 used 2344 :type table: str 2345 :return: A string of the extra infos 2346 """ 2347 2348 return ", ".join( 2349 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2350 ) 2351 2352 def export_header( 2353 self, 2354 header_name: str = None, 2355 output_file: str = None, 2356 output_file_ext: str = ".hdr", 2357 clean_header: bool = True, 2358 remove_chrom_line: bool = False, 2359 ) -> str: 2360 """ 2361 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2362 specified options, and writes it to a new file. 2363 2364 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2365 this parameter is not specified, the header will be written to the output file 2366 :type header_name: str 2367 :param output_file: The `output_file` parameter in the `export_header` function is used to 2368 specify the name of the output file where the header will be written. If this parameter is not 2369 provided, the header will be written to a temporary file 2370 :type output_file: str 2371 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2372 string that represents the extension of the output header file. By default, it is set to ".hdr" 2373 if not specified by the user. This extension will be appended to the `output_file` name to 2374 create the final, defaults to .hdr 2375 :type output_file_ext: str (optional) 2376 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2377 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2378 `True`, the function will clean the header by modifying certain lines based on a specific 2379 pattern. 
If `clean_header`, defaults to True 2380 :type clean_header: bool (optional) 2381 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2382 boolean flag that determines whether the #CHROM line should be removed from the header before 2383 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2384 defaults to False 2385 :type remove_chrom_line: bool (optional) 2386 :return: The function `export_header` returns the name of the temporary header file that is 2387 created. 2388 """ 2389 2390 if not header_name and not output_file: 2391 output_file = self.get_output() 2392 2393 if self.get_header(): 2394 2395 # Get header object 2396 header_obj = self.get_header() 2397 2398 # Create database 2399 db_for_header = Database(database=self.get_input()) 2400 2401 # Get real columns in the file 2402 db_header_columns = db_for_header.get_columns() 2403 2404 with tempfile.TemporaryDirectory() as tmpdir: 2405 2406 # Write header file 2407 header_file_tmp = os.path.join(tmpdir, "header") 2408 f = open(header_file_tmp, "w") 2409 vcf.Writer(f, header_obj) 2410 f.close() 2411 2412 # Replace #CHROM line with rel columns 2413 header_list = db_for_header.read_header_file( 2414 header_file=header_file_tmp 2415 ) 2416 header_list[-1] = "\t".join(db_header_columns) 2417 2418 # Remove CHROM line 2419 if remove_chrom_line: 2420 header_list.pop() 2421 2422 # Clean header 2423 if clean_header: 2424 header_list_clean = [] 2425 for head in header_list: 2426 # Clean head for malformed header 2427 head_clean = head 2428 head_clean = re.subn( 2429 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2430 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2431 head_clean, 2432 2, 2433 )[0] 2434 # Write header 2435 header_list_clean.append(head_clean) 2436 header_list = header_list_clean 2437 2438 tmp_header_name = output_file + output_file_ext 2439 2440 f = open(tmp_header_name, "w") 2441 for line in header_list: 2442 f.write(line) 
2443 f.close() 2444 2445 return tmp_header_name 2446 2447 def export_variant_vcf( 2448 self, 2449 vcf_file, 2450 remove_info: bool = False, 2451 add_samples: bool = True, 2452 list_samples: list = [], 2453 where_clause: str = "", 2454 index: bool = False, 2455 threads: int | None = None, 2456 ) -> bool | None: 2457 """ 2458 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2459 remove INFO field, add samples, and control compression and indexing. 2460 2461 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2462 written to. It is the output file that will contain the filtered VCF data based on the specified 2463 parameters 2464 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2465 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2466 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2467 in, defaults to False 2468 :type remove_info: bool (optional) 2469 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2470 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2471 If set to False, the samples will be removed. The default value is True, defaults to True 2472 :type add_samples: bool (optional) 2473 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2474 in the output VCF file. By default, all samples will be included. If you provide a list of 2475 samples, only those samples will be included in the output file 2476 :type list_samples: list 2477 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2478 determines whether or not to create an index for the output VCF file. If `index` is set to 2479 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2480 :type index: bool (optional) 2481 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2482 number of threads to use for exporting the VCF file. It determines how many parallel threads 2483 will be used during the export process. More threads can potentially speed up the export process 2484 by utilizing multiple cores of the processor. If 2485 :type threads: int | None 2486 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2487 method with various parameters including the output file, query, threads, sort flag, and index 2488 flag. The `export_output` method is responsible for exporting the VCF data based on the 2489 specified parameters and configurations provided in the `export_variant_vcf` function. 2490 """ 2491 2492 # Config 2493 config = self.get_config() 2494 2495 # Extract VCF 2496 log.debug("Export VCF...") 2497 2498 # Table variants 2499 table_variants = self.get_table_variants() 2500 2501 # Threads 2502 if not threads: 2503 threads = self.get_threads() 2504 2505 # Info fields 2506 if remove_info: 2507 if not isinstance(remove_info, str): 2508 remove_info = "." 
2509 info_field = f"""'{remove_info}' as INFO""" 2510 else: 2511 info_field = "INFO" 2512 2513 # Samples fields 2514 if add_samples: 2515 if not list_samples: 2516 list_samples = self.get_header_sample_list() 2517 if list_samples: 2518 samples_fields = " , FORMAT , " + " , ".join( 2519 [f""" "{sample}" """ for sample in list_samples] 2520 ) 2521 else: 2522 samples_fields = "" 2523 log.debug(f"samples_fields: {samples_fields}") 2524 else: 2525 samples_fields = "" 2526 2527 # Where clause 2528 if where_clause is None: 2529 where_clause = "" 2530 2531 # Variants 2532 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2533 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2534 log.debug(f"sql_query_select={sql_query_select}") 2535 2536 return self.export_output( 2537 output_file=vcf_file, 2538 output_header=None, 2539 export_header=True, 2540 query=sql_query_select, 2541 parquet_partitions=None, 2542 chunk_size=config.get("chunk_size", None), 2543 threads=threads, 2544 sort=True, 2545 index=index, 2546 order_by=None, 2547 ) 2548 2549 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2550 """ 2551 It takes a list of commands and runs them in parallel using the number of threads specified 2552 2553 :param commands: A list of commands to run 2554 :param threads: The number of threads to use, defaults to 1 (optional) 2555 """ 2556 2557 run_parallel_commands(commands, threads) 2558 2559 def get_threads(self, default: int = 1) -> int: 2560 """ 2561 This function returns the number of threads to use for a job, with a default value of 1 if not 2562 specified. 2563 2564 :param default: The `default` parameter in the `get_threads` method is used to specify the 2565 default number of threads to use if no specific value is provided. 
        If no value is provided for
        the `threads` parameter in the configuration or input parameters, the `default` value will be
        used, defaults to 1
        :type default: int (optional)
        :return: the number of threads to use for the current job.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input threads (param takes precedence over config)
        input_thread = param.get("threads", config.get("threads", None))

        # Check threads: a non-positive value means "use all available cores"
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str | None = None) -> str | None:
        """
        Retrieve the memory setting from parameters or configuration.

        The value is looked up first in the parameters, then in the
        configuration; when absent from both, `default` is returned.

        :param default: fallback value returned when no memory setting is found
        :type default: str | None
        :return: the configured memory setting, or `default` when not configured
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory (param takes precedence over config)
        input_memory = param.get("memory", config.get("memory", None))

        # Fall back to the provided default when not configured
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Dispatch the INFO update to the engine-specific implementation
        (duckdb or sqlite) depending on the current connexion format.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file, using DuckDB.

        The VCF is loaded into a pandas DataFrame (`vcf_df`) which DuckDB
        queries directly by variable name in the SQL below (replacement scan),
        so the DataFrame is not "unused" despite appearances.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Loading VCF into a DataFrame, skipping the header lines
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the VCF INFO to the existing INFO, joined by ';' when both
        # sides are non-empty ('' and '.' count as empty)
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
            CASE
                WHEN INFO NOT IN ('', '.')
                THEN INFO
                ELSE ''
            END,
            (
            SELECT
                concat(
                    CASE
                        WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                        THEN ';'
                        ELSE ''
                    END
                    ,
                    CASE
                        WHEN table_parquet.INFO NOT IN ('','.')
                        THEN table_parquet.INFO
                        ELSE ''
                    END
                )
            FROM vcf_df as table_parquet
            WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                AND table_parquet.\"POS\" = table_variants.\"POS\"
                AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                AND table_parquet.\"REF\" = table_variants.\"REF\"
                AND table_parquet.INFO NOT IN ('','.')
            )
        )
        ;
        """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
        table, then updates the INFO column of the variants table with the INFO column of the temporary
        table

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same shape as 'variants' (WHERE 0
        # copies the schema without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Loading VCF into temporary table
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (SQLite has no concat())
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                WHEN INFO NOT IN ('', '.')
                THEN INFO
                ELSE ''
            END ||
            (
            SELECT
                CASE
                    WHEN table_variants.INFO NOT IN ('','.')
                        AND table_vcf.INFO NOT IN ('','.')
                    THEN ';'
                    ELSE ''
                END ||
                CASE
                    WHEN table_vcf.INFO NOT IN ('','.')
                    THEN table_vcf.INFO
                    ELSE ''
                END
            FROM {table_vcf} as table_vcf
            WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                AND table_vcf.\"POS\" = table_variants.\"POS\"
                AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                AND table_vcf.\"REF\" = table_variants.\"REF\"
            )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table if it exists.
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        It adds a column to the variants table called `variant_id` and populates it with a hash of the
        `#CHROM`, `POS`, `REF`, and `ALT` columns

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument is the SQL string literal
            # '"{prefix}SVTYPE"', not the exploded SVTYPE column value —
            # confirm this quoting is intended
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

    def get_variant_id_column(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Return the variant_id column name, creating the column when needed
        (delegates to `set_variant_id`).

        :param variant_id_column: name of the variant id column, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: passed through to `set_variant_id` to force recreation
        :type force: bool
        :return: The variant_id column name.
        """

        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

    ###
    # Annotation
    ###

    def scan_databases(
        self,
        database_formats: list = ["parquet"],
        database_releases: list = ["current"],
    ) -> dict:
        """
        The function `scan_databases` scans for available databases based on specified formats and
        releases.

        :param database_formats: The `database_formats` parameter is a list that specifies the formats
        of the databases to be scanned. In this case, the accepted format is "parquet"
        :type database_formats: list ["parquet"]
        :param database_releases: The `database_releases` parameter is a list that specifies the
        releases of the databases to be scanned. In the provided function, the default value for
        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
        databases that are in the "current"
        :type database_releases: list
        :return: The function `scan_databases` returns a dictionary containing information about
        databases that match the specified formats and releases.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param takes precedence over config)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # Scan for available databases
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
        )
        databases_infos_dict = databases_infos(
            database_folder_releases=database_releases,
            database_formats=database_formats,
            assembly=assembly,
            config=config,
        )
        log.info(
            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
        )

        return databases_infos_dict

    def annotation(self) -> None:
        """
        It annotates the VCF file with the annotations specified in the config file.
2892 """ 2893 2894 # Config 2895 config = self.get_config() 2896 2897 # Param 2898 param = self.get_param() 2899 2900 # Param - Assembly 2901 assembly = param.get("assembly", config.get("assembly", None)) 2902 if not assembly: 2903 assembly = DEFAULT_ASSEMBLY 2904 log.warning(f"Default assembly '{assembly}'") 2905 2906 # annotations databases folders 2907 annotations_databases = set( 2908 config.get("folders", {}) 2909 .get("databases", {}) 2910 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2911 + config.get("folders", {}) 2912 .get("databases", {}) 2913 .get("parquet", ["~/howard/databases/parquet/current"]) 2914 + config.get("folders", {}) 2915 .get("databases", {}) 2916 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2917 ) 2918 2919 # Get param annotations 2920 if param.get("annotations", None) and isinstance( 2921 param.get("annotations", None), str 2922 ): 2923 log.debug(param.get("annotations", None)) 2924 param_annotation_list = param.get("annotations").split(",") 2925 else: 2926 param_annotation_list = [] 2927 2928 # Each tools param 2929 if param.get("annotation_parquet", None) != None: 2930 log.debug( 2931 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2932 ) 2933 if isinstance(param.get("annotation_parquet", None), list): 2934 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2935 else: 2936 param_annotation_list.append(param.get("annotation_parquet")) 2937 if param.get("annotation_snpsift", None) != None: 2938 if isinstance(param.get("annotation_snpsift", None), list): 2939 param_annotation_list.append( 2940 "snpsift:" 2941 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2942 ) 2943 else: 2944 param_annotation_list.append( 2945 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2946 ) 2947 if param.get("annotation_snpeff", None) != None: 2948 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2949 if param.get("annotation_bcftools", 
None) != None: 2950 if isinstance(param.get("annotation_bcftools", None), list): 2951 param_annotation_list.append( 2952 "bcftools:" 2953 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2954 ) 2955 else: 2956 param_annotation_list.append( 2957 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2958 ) 2959 if param.get("annotation_annovar", None) != None: 2960 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2961 if param.get("annotation_exomiser", None) != None: 2962 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2963 if param.get("annotation_splice", None) != None: 2964 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2965 2966 # Merge param annotations list 2967 param["annotations"] = ",".join(param_annotation_list) 2968 2969 # debug 2970 log.debug(f"param_annotations={param['annotations']}") 2971 2972 if param.get("annotations"): 2973 2974 # Log 2975 # log.info("Annotations - Check annotation parameters") 2976 2977 if not "annotation" in param: 2978 param["annotation"] = {} 2979 2980 # List of annotations parameters 2981 annotations_list_input = {} 2982 if isinstance(param.get("annotations", None), str): 2983 annotation_file_list = [ 2984 value for value in param.get("annotations", "").split(",") 2985 ] 2986 for annotation_file in annotation_file_list: 2987 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2988 else: 2989 annotations_list_input = param.get("annotations", {}) 2990 2991 log.info(f"Quick Annotations:") 2992 for annotation_key in list(annotations_list_input.keys()): 2993 log.info(f" {annotation_key}") 2994 2995 # List of annotations and associated fields 2996 annotations_list = {} 2997 2998 for annotation_file in annotations_list_input: 2999 3000 # Explode annotations if ALL 3001 if ( 3002 annotation_file.upper() == "ALL" 3003 or annotation_file.upper().startswith("ALL:") 3004 ): 3005 3006 # check ALL parameters (formats, 
releases) 3007 annotation_file_split = annotation_file.split(":") 3008 database_formats = "parquet" 3009 database_releases = "current" 3010 for annotation_file_option in annotation_file_split[1:]: 3011 database_all_options_split = annotation_file_option.split("=") 3012 if database_all_options_split[0] == "format": 3013 database_formats = database_all_options_split[1].split("+") 3014 if database_all_options_split[0] == "release": 3015 database_releases = database_all_options_split[1].split("+") 3016 3017 # Scan for availabled databases 3018 databases_infos_dict = self.scan_databases( 3019 database_formats=database_formats, 3020 database_releases=database_releases, 3021 ) 3022 3023 # Add found databases in annotation parameters 3024 for database_infos in databases_infos_dict.keys(): 3025 annotations_list[database_infos] = {"INFO": None} 3026 3027 else: 3028 annotations_list[annotation_file] = annotations_list_input[ 3029 annotation_file 3030 ] 3031 3032 # Check each databases 3033 if len(annotations_list): 3034 3035 log.info( 3036 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3037 ) 3038 3039 for annotation_file in annotations_list: 3040 3041 # Init 3042 annotations = annotations_list.get(annotation_file, None) 3043 3044 # Annotation snpEff 3045 if annotation_file.startswith("snpeff"): 3046 3047 log.debug(f"Quick Annotation snpEff") 3048 3049 if "snpeff" not in param["annotation"]: 3050 param["annotation"]["snpeff"] = {} 3051 3052 if "options" not in param["annotation"]["snpeff"]: 3053 param["annotation"]["snpeff"]["options"] = "" 3054 3055 # snpEff options in annotations 3056 param["annotation"]["snpeff"]["options"] = "".join( 3057 annotation_file.split(":")[1:] 3058 ) 3059 3060 # Annotation Annovar 3061 elif annotation_file.startswith("annovar"): 3062 3063 log.debug(f"Quick Annotation Annovar") 3064 3065 if "annovar" not in param["annotation"]: 3066 param["annotation"]["annovar"] = {} 3067 3068 if "annotations" not in param["annotation"]["annovar"]: 3069 param["annotation"]["annovar"]["annotations"] = {} 3070 3071 # Options 3072 annotation_file_split = annotation_file.split(":") 3073 for annotation_file_annotation in annotation_file_split[1:]: 3074 if annotation_file_annotation: 3075 param["annotation"]["annovar"]["annotations"][ 3076 annotation_file_annotation 3077 ] = annotations 3078 3079 # Annotation Exomiser 3080 elif annotation_file.startswith("exomiser"): 3081 3082 log.debug(f"Quick Annotation Exomiser") 3083 3084 param["annotation"]["exomiser"] = params_string_to_dict( 3085 annotation_file 3086 ) 3087 3088 # Annotation Splice 3089 elif annotation_file.startswith("splice"): 3090 3091 log.debug(f"Quick Annotation Splice") 3092 3093 param["annotation"]["splice"] = params_string_to_dict( 3094 annotation_file 3095 ) 3096 3097 # Annotation Parquet or BCFTOOLS 3098 else: 3099 3100 # Tools detection 3101 if annotation_file.startswith("bcftools:"): 3102 annotation_tool_initial = "bcftools" 3103 annotation_file = ":".join(annotation_file.split(":")[1:]) 3104 elif annotation_file.startswith("snpsift:"): 3105 annotation_tool_initial = 
"snpsift" 3106 annotation_file = ":".join(annotation_file.split(":")[1:]) 3107 elif annotation_file.startswith("bigwig:"): 3108 annotation_tool_initial = "bigwig" 3109 annotation_file = ":".join(annotation_file.split(":")[1:]) 3110 else: 3111 annotation_tool_initial = None 3112 3113 # list of files 3114 annotation_file_list = annotation_file.replace("+", ":").split( 3115 ":" 3116 ) 3117 3118 for annotation_file in annotation_file_list: 3119 3120 if annotation_file: 3121 3122 # Annotation tool initial 3123 annotation_tool = annotation_tool_initial 3124 3125 # Find file 3126 annotation_file_found = None 3127 3128 if os.path.exists(annotation_file): 3129 annotation_file_found = annotation_file 3130 elif os.path.exists(full_path(annotation_file)): 3131 annotation_file_found = full_path(annotation_file) 3132 else: 3133 # Find within assembly folders 3134 for annotations_database in annotations_databases: 3135 found_files = find_all( 3136 annotation_file, 3137 os.path.join( 3138 annotations_database, assembly 3139 ), 3140 ) 3141 if len(found_files) > 0: 3142 annotation_file_found = found_files[0] 3143 break 3144 if not annotation_file_found and not assembly: 3145 # Find within folders 3146 for ( 3147 annotations_database 3148 ) in annotations_databases: 3149 found_files = find_all( 3150 annotation_file, annotations_database 3151 ) 3152 if len(found_files) > 0: 3153 annotation_file_found = found_files[0] 3154 break 3155 log.debug( 3156 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3157 ) 3158 3159 # Full path 3160 annotation_file_found = full_path(annotation_file_found) 3161 3162 if annotation_file_found: 3163 3164 database = Database(database=annotation_file_found) 3165 quick_annotation_format = database.get_format() 3166 quick_annotation_is_compressed = ( 3167 database.is_compressed() 3168 ) 3169 quick_annotation_is_indexed = os.path.exists( 3170 f"{annotation_file_found}.tbi" 3171 ) 3172 bcftools_preference = False 3173 3174 # Check Annotation 
Tool 3175 if not annotation_tool: 3176 if ( 3177 bcftools_preference 3178 and quick_annotation_format 3179 in ["vcf", "bed"] 3180 and quick_annotation_is_compressed 3181 and quick_annotation_is_indexed 3182 ): 3183 annotation_tool = "bcftools" 3184 elif quick_annotation_format in [ 3185 "vcf", 3186 "bed", 3187 "tsv", 3188 "tsv", 3189 "csv", 3190 "json", 3191 "tbl", 3192 "parquet", 3193 "duckdb", 3194 ]: 3195 annotation_tool = "parquet" 3196 elif quick_annotation_format in ["bw"]: 3197 annotation_tool = "bigwig" 3198 else: 3199 log.error( 3200 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3201 ) 3202 raise ValueError( 3203 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3204 ) 3205 3206 log.debug( 3207 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3208 ) 3209 3210 # Annotation Tool dispatch 3211 if annotation_tool: 3212 if annotation_tool not in param["annotation"]: 3213 param["annotation"][annotation_tool] = {} 3214 if ( 3215 "annotations" 3216 not in param["annotation"][annotation_tool] 3217 ): 3218 param["annotation"][annotation_tool][ 3219 "annotations" 3220 ] = {} 3221 param["annotation"][annotation_tool][ 3222 "annotations" 3223 ][annotation_file_found] = annotations 3224 3225 else: 3226 log.warning( 3227 f"Quick Annotation File {annotation_file} does NOT exist" 3228 ) 3229 3230 self.set_param(param) 3231 3232 if param.get("annotation", None): 3233 log.info("Annotations") 3234 if param.get("annotation", {}).get("parquet", None): 3235 log.info("Annotations 'parquet'...") 3236 self.annotation_parquet() 3237 if param.get("annotation", {}).get("bcftools", None): 3238 log.info("Annotations 'bcftools'...") 3239 self.annotation_bcftools() 3240 if param.get("annotation", {}).get("snpsift", None): 3241 log.info("Annotations 'snpsift'...") 3242 self.annotation_snpsift() 3243 if param.get("annotation", {}).get("bigwig", None): 
    def annotation_bigwig(self, threads: int = None) -> None:
        """
        Annotate variants with per-position scores from BigWig databases.

        Workflow, as implemented below:
          1. Resolve each configured BigWig database (local file, or HTTP URL
             for which a minimal header file is generated on the fly).
          2. Register the requested annotation fields on the VCF header
             (``vcf_reader.infos``) and record, per database, the cyvcf2
             header entries and the column indexes to read.
          3. Export the variants table to a temporary bgzipped VCF, re-read it
             with cyvcf2, look up the score at each variant position with
             pyBigWig, and write an annotated output VCF.
          4. Merge the annotated VCF back into the variants table via
             ``update_from_vcf``.

        :param threads: number of threads to use; currently UNUSED — the
            thread-selection code below is commented out
        :type threads: int
        :return: ``True`` once annotation completed; ``None`` when the
            variants table is empty (early return). NOTE(review): the
            ``-> None`` annotation does not match this — confirm intent.
        """

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - databases folders: generic annotation folders plus
        # bigwig-specific folders; set() removes duplicates
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - bigwig annotations requested in parameters
        # (dict: database -> {field: new_name} or None)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, falls back to default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (logged for debugging only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Temporary VCF exported from the variants table
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # One config entry per database: open handle, header fields
                # to add, and header-column indexes to read values from
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    # No explicit fields requested -> take all INFO fields
                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database object to resolve database/header files
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is http ?
                    if database.get_database().startswith("http"):

                        # Database is an HTTP URL (experimental)
                        db_file_is_http = True

                        # DB file kept as URL (pyBigWig opens URLs directly)
                        db_file = database.get_database()
                        log.warning(
                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                        )

                        # Retrieve automatic annotation field name from the
                        # URL basename (".bw" suffix stripped)
                        annotation_field = clean_annotation_field(
                            os.path.basename(db_file).replace(".bw", "")
                        )
                        log.debug(
                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                        )

                        # Create automatic header file describing the single
                        # Float INFO field for this remote database
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, "w") as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(
                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                            )
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT an HTTP URL
                        db_file_is_http = False

                    # Check database validity: local file must exist (unless
                    # HTTP), header must exist, format must be BigWig ("bw")
                    if (
                        db_file is None
                        or db_hdr_file is None
                        or (not os.path.exists(db_file) and not db_file_is_http)
                        or not os.path.exists(db_hdr_file)
                        or not db_file_type in ["bw"]
                    ):
                        # if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database: "ALL"/"INFO" expands to
                        # every field declared in the database header
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name (rename target)
                            annotation_field_new = annotation_fields[annotation_field]

                            # Check annotation field and index in header;
                            # "- 3" skips the #CHROM/START/END columns so the
                            # index addresses the value returned by bw.values
                            if (
                                annotation_field
                                in db_hdr_vcf.get_header_columns_as_list()
                            ):
                                annotation_field_index = (
                                    db_hdr_vcf.get_header_columns_as_list().index(
                                        annotation_field
                                    )
                                    - 3
                                )
                                cyvcf2_header_indexes[annotation_field_new] = (
                                    annotation_field_index
                                )
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = (
                                db_hdr_vcf_header_infos[annotation_field].id
                            )
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].num,
                                    "Type": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].type,
                                    "Description": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].desc,
                                }
                            )

                            # Add header on VCF (pyvcf header object), so the
                            # new field is declared on the in-memory header
                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                                annotation_field_new,
                                db_hdr_vcf_header_infos[annotation_field].num,
                                db_hdr_vcf_header_infos[annotation_field].type,
                                db_hdr_vcf_header_infos[annotation_field].desc,
                                "HOWARD BigWig annotation",
                                "unknown",
                                self.code_type_map[
                                    db_hdr_vcf_header_infos[annotation_field].type
                                ],
                            )

                        # Load bigwig database
                        # NOTE(review): this handle is never closed
                        # (no bw_db.close()) — potential resource leak;
                        # confirm and consider closing after the variant loop
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(
                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
                    )

                    # Export VCF file (INFO stripped, no samples, indexed)
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    # NOTE(review): input_vcf is never closed — confirm
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get(
                            "cyvcf2_header_list", []
                        ):
                            log.info(
                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                            )
                            input_vcf.add_info_to_header(cyvcf2_header_field)

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Fetch variants
                    log.info(f"Annotations 'bigwig' start...")
                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get(
                                "cyvcf2_header_indexes", None
                            )

                            # Retrieve value from chrom pos (BigWig is
                            # 0-based half-open, VCF POS is 1-based)
                            res = bw_db.values(
                                variant.CHROM, variant.POS - 1, variant.POS
                            )

                            # For each annotation fields (and indexes)
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # Skip NaN (no value at this position)
                                if not np.isnan(
                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
                                ):
                                    variant.INFO[cyvcf2_header_index] = res[
                                        cyvcf2_header_indexes[cyvcf2_header_index]
                                    ]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        return True
for variant in input_vcf: 3578 3579 for annotation_bigwig_config in annotation_bigwig_config_list: 3580 3581 # DB and indexes 3582 bw_db = annotation_bigwig_config.get("bw_db", None) 3583 cyvcf2_header_indexes = annotation_bigwig_config.get( 3584 "cyvcf2_header_indexes", None 3585 ) 3586 3587 # Retrieve value from chrom pos 3588 res = bw_db.values( 3589 variant.CHROM, variant.POS - 1, variant.POS 3590 ) 3591 3592 # For each annotation fields (and indexes) 3593 for cyvcf2_header_index in cyvcf2_header_indexes: 3594 3595 # If value is NOT nNone 3596 if not np.isnan( 3597 res[cyvcf2_header_indexes[cyvcf2_header_index]] 3598 ): 3599 variant.INFO[cyvcf2_header_index] = res[ 3600 cyvcf2_header_indexes[cyvcf2_header_index] 3601 ] 3602 3603 # Add record in output file 3604 output_vcf.write_record(variant) 3605 3606 # Log 3607 log.debug(f"Annotation done.") 3608 3609 # Close and write file 3610 log.info(f"Annotations 'bigwig' write...") 3611 output_vcf.close() 3612 log.debug(f"Write done.") 3613 3614 # Update variants 3615 log.info(f"Annotations 'bigwig' update...") 3616 self.update_from_vcf(output_vcf_file) 3617 log.debug(f"Update done.") 3618 3619 return True 3620 3621 def annotation_snpsift(self, threads: int = None) -> None: 3622 """ 3623 This function annotate with bcftools 3624 3625 :param threads: Number of threads to use 3626 :return: the value of the variable "return_value". 
3627 """ 3628 3629 # DEBUG 3630 log.debug("Start annotation with bcftools databases") 3631 3632 # Threads 3633 if not threads: 3634 threads = self.get_threads() 3635 log.debug("Threads: " + str(threads)) 3636 3637 # Config 3638 config = self.get_config() 3639 log.debug("Config: " + str(config)) 3640 3641 # Config - snpSift 3642 snpsift_bin_command = get_bin_command( 3643 bin="SnpSift.jar", 3644 tool="snpsift", 3645 bin_type="jar", 3646 config=config, 3647 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3648 ) 3649 if not snpsift_bin_command: 3650 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3651 log.error(msg_err) 3652 raise ValueError(msg_err) 3653 3654 # Config - bcftools 3655 bcftools_bin_command = get_bin_command( 3656 bin="bcftools", 3657 tool="bcftools", 3658 bin_type="bin", 3659 config=config, 3660 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3661 ) 3662 if not bcftools_bin_command: 3663 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3664 log.error(msg_err) 3665 raise ValueError(msg_err) 3666 3667 # Config - BCFTools databases folders 3668 databases_folders = set( 3669 self.get_config() 3670 .get("folders", {}) 3671 .get("databases", {}) 3672 .get("annotations", ["."]) 3673 + self.get_config() 3674 .get("folders", {}) 3675 .get("databases", {}) 3676 .get("bcftools", ["."]) 3677 ) 3678 log.debug("Databases annotations: " + str(databases_folders)) 3679 3680 # Param 3681 annotations = ( 3682 self.get_param() 3683 .get("annotation", {}) 3684 .get("snpsift", {}) 3685 .get("annotations", None) 3686 ) 3687 log.debug("Annotations: " + str(annotations)) 3688 3689 # Assembly 3690 assembly = self.get_param().get( 3691 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3692 ) 3693 3694 # Data 3695 table_variants = self.get_table_variants() 3696 3697 # Check if not empty 3698 log.debug("Check if not empty") 3699 sql_query_chromosomes = ( 3700 f"""SELECT count(*) as count FROM {table_variants} as 
table_variants""" 3701 ) 3702 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3703 if not sql_query_chromosomes_df["count"][0]: 3704 log.info(f"VCF empty") 3705 return 3706 3707 # VCF header 3708 vcf_reader = self.get_header() 3709 log.debug("Initial header: " + str(vcf_reader.infos)) 3710 3711 # Existing annotations 3712 for vcf_annotation in self.get_header().infos: 3713 3714 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3715 log.debug( 3716 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3717 ) 3718 3719 if annotations: 3720 3721 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3722 3723 # Export VCF file 3724 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3725 3726 # Init 3727 commands = {} 3728 3729 for annotation in annotations: 3730 annotation_fields = annotations[annotation] 3731 3732 # Annotation Name 3733 annotation_name = os.path.basename(annotation) 3734 3735 if not annotation_fields: 3736 annotation_fields = {"INFO": None} 3737 3738 log.debug(f"Annotation '{annotation_name}'") 3739 log.debug( 3740 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3741 ) 3742 3743 # Create Database 3744 database = Database( 3745 database=annotation, 3746 databases_folders=databases_folders, 3747 assembly=assembly, 3748 ) 3749 3750 # Find files 3751 db_file = database.get_database() 3752 db_file = full_path(db_file) 3753 db_hdr_file = database.get_header_file() 3754 db_hdr_file = full_path(db_hdr_file) 3755 db_file_type = database.get_format() 3756 db_tbi_file = f"{db_file}.tbi" 3757 db_file_compressed = database.is_compressed() 3758 3759 # Check if compressed 3760 if not db_file_compressed: 3761 log.error( 3762 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3763 ) 3764 raise ValueError( 3765 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3766 ) 3767 3768 # Check if indexed 3769 if not os.path.exists(db_tbi_file): 3770 log.error( 3771 
f"Annotation '{annotation}' - {db_file} NOT indexed file" 3772 ) 3773 raise ValueError( 3774 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3775 ) 3776 3777 # Check index - try to create if not exists 3778 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3779 log.error("Annotation failed: database not valid") 3780 log.error(f"Annotation annotation file: {db_file}") 3781 log.error(f"Annotation annotation header: {db_hdr_file}") 3782 log.error(f"Annotation annotation index: {db_tbi_file}") 3783 raise ValueError( 3784 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3785 ) 3786 else: 3787 3788 log.debug( 3789 f"Annotation '{annotation}' - file: " 3790 + str(db_file) 3791 + " and " 3792 + str(db_hdr_file) 3793 ) 3794 3795 # Load header as VCF object 3796 db_hdr_vcf = Variants(input=db_hdr_file) 3797 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3798 log.debug( 3799 "Annotation database header: " 3800 + str(db_hdr_vcf_header_infos) 3801 ) 3802 3803 # For all fields in database 3804 annotation_fields_full = False 3805 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3806 annotation_fields = { 3807 key: key for key in db_hdr_vcf_header_infos 3808 } 3809 log.debug( 3810 "Annotation database header - All annotations added: " 3811 + str(annotation_fields) 3812 ) 3813 annotation_fields_full = True 3814 3815 # # Create file for field rename 3816 # log.debug("Create file for field rename") 3817 # tmp_rename = NamedTemporaryFile( 3818 # prefix=self.get_prefix(), 3819 # dir=self.get_tmp_dir(), 3820 # suffix=".rename", 3821 # delete=False, 3822 # ) 3823 # tmp_rename_name = tmp_rename.name 3824 # tmp_files.append(tmp_rename_name) 3825 3826 # Number of fields 3827 nb_annotation_field = 0 3828 annotation_list = [] 3829 annotation_infos_rename_list = [] 3830 3831 for annotation_field in 
annotation_fields: 3832 3833 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3834 annotation_fields_new_name = annotation_fields.get( 3835 annotation_field, annotation_field 3836 ) 3837 if not annotation_fields_new_name: 3838 annotation_fields_new_name = annotation_field 3839 3840 # Check if field is in DB and if field is not elready in input data 3841 if ( 3842 annotation_field in db_hdr_vcf.get_header().infos 3843 and annotation_fields_new_name 3844 not in self.get_header().infos 3845 ): 3846 3847 log.info( 3848 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3849 ) 3850 3851 # BCFTools annotate param to rename fields 3852 if annotation_field != annotation_fields_new_name: 3853 annotation_infos_rename_list.append( 3854 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3855 ) 3856 3857 # Add INFO field to header 3858 db_hdr_vcf_header_infos_number = ( 3859 db_hdr_vcf_header_infos[annotation_field].num or "." 3860 ) 3861 db_hdr_vcf_header_infos_type = ( 3862 db_hdr_vcf_header_infos[annotation_field].type 3863 or "String" 3864 ) 3865 db_hdr_vcf_header_infos_description = ( 3866 db_hdr_vcf_header_infos[annotation_field].desc 3867 or f"{annotation_field} description" 3868 ) 3869 db_hdr_vcf_header_infos_source = ( 3870 db_hdr_vcf_header_infos[annotation_field].source 3871 or "unknown" 3872 ) 3873 db_hdr_vcf_header_infos_version = ( 3874 db_hdr_vcf_header_infos[annotation_field].version 3875 or "unknown" 3876 ) 3877 3878 vcf_reader.infos[annotation_fields_new_name] = ( 3879 vcf.parser._Info( 3880 annotation_fields_new_name, 3881 db_hdr_vcf_header_infos_number, 3882 db_hdr_vcf_header_infos_type, 3883 db_hdr_vcf_header_infos_description, 3884 db_hdr_vcf_header_infos_source, 3885 db_hdr_vcf_header_infos_version, 3886 self.code_type_map[ 3887 db_hdr_vcf_header_infos_type 3888 ], 3889 ) 3890 ) 3891 3892 annotation_list.append(annotation_field) 3893 3894 nb_annotation_field += 1 3895 3896 else: 3897 
3898 if ( 3899 annotation_field 3900 not in db_hdr_vcf.get_header().infos 3901 ): 3902 log.warning( 3903 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3904 ) 3905 if ( 3906 annotation_fields_new_name 3907 in self.get_header().infos 3908 ): 3909 log.warning( 3910 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3911 ) 3912 3913 log.info( 3914 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3915 ) 3916 3917 annotation_infos = ",".join(annotation_list) 3918 3919 if annotation_infos != "": 3920 3921 # Annotated VCF (and error file) 3922 tmp_annotation_vcf_name = os.path.join( 3923 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3924 ) 3925 tmp_annotation_vcf_name_err = ( 3926 tmp_annotation_vcf_name + ".err" 3927 ) 3928 3929 # Add fields to annotate 3930 if not annotation_fields_full: 3931 annotation_infos_option = f"-info {annotation_infos}" 3932 else: 3933 annotation_infos_option = "" 3934 3935 # Info fields rename 3936 if annotation_infos_rename_list: 3937 annotation_infos_rename = " -c " + ",".join( 3938 annotation_infos_rename_list 3939 ) 3940 else: 3941 annotation_infos_rename = "" 3942 3943 # Annotate command 3944 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3945 3946 # Add command 3947 commands[command_annotate] = tmp_annotation_vcf_name 3948 3949 if commands: 3950 3951 # Export VCF file 3952 self.export_variant_vcf( 3953 vcf_file=tmp_vcf_name, 3954 remove_info=True, 3955 add_samples=False, 3956 index=True, 3957 ) 3958 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3959 3960 # Num command 3961 nb_command = 0 3962 3963 # Annotate 3964 for command_annotate in commands: 3965 nb_command += 1 3966 log.info( 3967 f"Annotation - 
Annotate [{nb_command}/{len(commands)}]..." 3968 ) 3969 log.debug(f"command_annotate={command_annotate}") 3970 run_parallel_commands([command_annotate], threads) 3971 3972 # Debug 3973 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3974 3975 # Update variants 3976 log.info( 3977 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3978 ) 3979 self.update_from_vcf(commands[command_annotate]) 3980 3981 def annotation_bcftools(self, threads: int = None) -> None: 3982 """ 3983 This function annotate with bcftools 3984 3985 :param threads: Number of threads to use 3986 :return: the value of the variable "return_value". 3987 """ 3988 3989 # DEBUG 3990 log.debug("Start annotation with bcftools databases") 3991 3992 # Threads 3993 if not threads: 3994 threads = self.get_threads() 3995 log.debug("Threads: " + str(threads)) 3996 3997 # Config 3998 config = self.get_config() 3999 log.debug("Config: " + str(config)) 4000 4001 # DEBUG 4002 delete_tmp = True 4003 if self.get_config().get("verbosity", "warning") in ["debug"]: 4004 delete_tmp = False 4005 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 4006 4007 # Config - BCFTools bin command 4008 bcftools_bin_command = get_bin_command( 4009 bin="bcftools", 4010 tool="bcftools", 4011 bin_type="bin", 4012 config=config, 4013 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 4014 ) 4015 if not bcftools_bin_command: 4016 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 4017 log.error(msg_err) 4018 raise ValueError(msg_err) 4019 4020 # Config - BCFTools databases folders 4021 databases_folders = set( 4022 self.get_config() 4023 .get("folders", {}) 4024 .get("databases", {}) 4025 .get("annotations", ["."]) 4026 + self.get_config() 4027 .get("folders", {}) 4028 .get("databases", {}) 4029 .get("bcftools", ["."]) 4030 ) 4031 log.debug("Databases annotations: " + str(databases_folders)) 4032 4033 # Param 4034 annotations = ( 4035 self.get_param() 4036 .get("annotation", {}) 4037 
.get("bcftools", {}) 4038 .get("annotations", None) 4039 ) 4040 log.debug("Annotations: " + str(annotations)) 4041 4042 # Assembly 4043 assembly = self.get_param().get( 4044 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 4045 ) 4046 4047 # Data 4048 table_variants = self.get_table_variants() 4049 4050 # Check if not empty 4051 log.debug("Check if not empty") 4052 sql_query_chromosomes = ( 4053 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4054 ) 4055 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 4056 if not sql_query_chromosomes_df["count"][0]: 4057 log.info(f"VCF empty") 4058 return 4059 4060 # Export in VCF 4061 log.debug("Create initial file to annotate") 4062 tmp_vcf = NamedTemporaryFile( 4063 prefix=self.get_prefix(), 4064 dir=self.get_tmp_dir(), 4065 suffix=".vcf.gz", 4066 delete=False, 4067 ) 4068 tmp_vcf_name = tmp_vcf.name 4069 4070 # VCF header 4071 vcf_reader = self.get_header() 4072 log.debug("Initial header: " + str(vcf_reader.infos)) 4073 4074 # Existing annotations 4075 for vcf_annotation in self.get_header().infos: 4076 4077 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 4078 log.debug( 4079 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 4080 ) 4081 4082 if annotations: 4083 4084 tmp_ann_vcf_list = [] 4085 commands = [] 4086 tmp_files = [] 4087 err_files = [] 4088 4089 for annotation in annotations: 4090 annotation_fields = annotations[annotation] 4091 4092 # Annotation Name 4093 annotation_name = os.path.basename(annotation) 4094 4095 if not annotation_fields: 4096 annotation_fields = {"INFO": None} 4097 4098 log.debug(f"Annotation '{annotation_name}'") 4099 log.debug( 4100 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 4101 ) 4102 4103 # Create Database 4104 database = Database( 4105 database=annotation, 4106 databases_folders=databases_folders, 4107 assembly=assembly, 4108 ) 4109 4110 # Find files 4111 db_file = 
database.get_database() 4112 db_file = full_path(db_file) 4113 db_hdr_file = database.get_header_file() 4114 db_hdr_file = full_path(db_hdr_file) 4115 db_file_type = database.get_format() 4116 db_tbi_file = f"{db_file}.tbi" 4117 db_file_compressed = database.is_compressed() 4118 4119 # Check if compressed 4120 if not db_file_compressed: 4121 log.error( 4122 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4123 ) 4124 raise ValueError( 4125 f"Annotation '{annotation}' - {db_file} NOT compressed file" 4126 ) 4127 4128 # Check if indexed 4129 if not os.path.exists(db_tbi_file): 4130 log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file") 4131 raise ValueError( 4132 f"Annotation '{annotation}' - {db_file} NOT indexed file" 4133 ) 4134 4135 # Check index - try to create if not exists 4136 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 4137 log.error("Annotation failed: database not valid") 4138 log.error(f"Annotation annotation file: {db_file}") 4139 log.error(f"Annotation annotation header: {db_hdr_file}") 4140 log.error(f"Annotation annotation index: {db_tbi_file}") 4141 raise ValueError( 4142 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 4143 ) 4144 else: 4145 4146 log.debug( 4147 f"Annotation '{annotation}' - file: " 4148 + str(db_file) 4149 + " and " 4150 + str(db_hdr_file) 4151 ) 4152 4153 # Load header as VCF object 4154 db_hdr_vcf = Variants(input=db_hdr_file) 4155 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 4156 log.debug( 4157 "Annotation database header: " + str(db_hdr_vcf_header_infos) 4158 ) 4159 4160 # For all fields in database 4161 if "ALL" in annotation_fields or "INFO" in annotation_fields: 4162 annotation_fields = { 4163 key: key for key in db_hdr_vcf_header_infos 4164 } 4165 log.debug( 4166 "Annotation database header - All annotations added: " 4167 + 
str(annotation_fields) 4168 ) 4169 4170 # Number of fields 4171 nb_annotation_field = 0 4172 annotation_list = [] 4173 4174 for annotation_field in annotation_fields: 4175 4176 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 4177 annotation_fields_new_name = annotation_fields.get( 4178 annotation_field, annotation_field 4179 ) 4180 if not annotation_fields_new_name: 4181 annotation_fields_new_name = annotation_field 4182 4183 # Check if field is in DB and if field is not elready in input data 4184 if ( 4185 annotation_field in db_hdr_vcf.get_header().infos 4186 and annotation_fields_new_name 4187 not in self.get_header().infos 4188 ): 4189 4190 log.info( 4191 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 4192 ) 4193 4194 # Add INFO field to header 4195 db_hdr_vcf_header_infos_number = ( 4196 db_hdr_vcf_header_infos[annotation_field].num or "." 4197 ) 4198 db_hdr_vcf_header_infos_type = ( 4199 db_hdr_vcf_header_infos[annotation_field].type 4200 or "String" 4201 ) 4202 db_hdr_vcf_header_infos_description = ( 4203 db_hdr_vcf_header_infos[annotation_field].desc 4204 or f"{annotation_field} description" 4205 ) 4206 db_hdr_vcf_header_infos_source = ( 4207 db_hdr_vcf_header_infos[annotation_field].source 4208 or "unknown" 4209 ) 4210 db_hdr_vcf_header_infos_version = ( 4211 db_hdr_vcf_header_infos[annotation_field].version 4212 or "unknown" 4213 ) 4214 4215 vcf_reader.infos[annotation_fields_new_name] = ( 4216 vcf.parser._Info( 4217 annotation_fields_new_name, 4218 db_hdr_vcf_header_infos_number, 4219 db_hdr_vcf_header_infos_type, 4220 db_hdr_vcf_header_infos_description, 4221 db_hdr_vcf_header_infos_source, 4222 db_hdr_vcf_header_infos_version, 4223 self.code_type_map[db_hdr_vcf_header_infos_type], 4224 ) 4225 ) 4226 4227 # annotation_list.append(annotation_field) 4228 if annotation_field != annotation_fields_new_name: 4229 annotation_list.append( 4230 
f"{annotation_fields_new_name}:=INFO/{annotation_field}" 4231 ) 4232 else: 4233 annotation_list.append(annotation_field) 4234 4235 nb_annotation_field += 1 4236 4237 else: 4238 4239 if annotation_field not in db_hdr_vcf.get_header().infos: 4240 log.warning( 4241 f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file" 4242 ) 4243 if annotation_fields_new_name in self.get_header().infos: 4244 log.warning( 4245 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 4246 ) 4247 4248 log.info( 4249 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 4250 ) 4251 4252 annotation_infos = ",".join(annotation_list) 4253 4254 if annotation_infos != "": 4255 4256 # Protect header for bcftools (remove "#CHROM" and variants line) 4257 log.debug("Protect Header file - remove #CHROM line if exists") 4258 tmp_header_vcf = NamedTemporaryFile( 4259 prefix=self.get_prefix(), 4260 dir=self.get_tmp_dir(), 4261 suffix=".hdr", 4262 delete=False, 4263 ) 4264 tmp_header_vcf_name = tmp_header_vcf.name 4265 tmp_files.append(tmp_header_vcf_name) 4266 # Command 4267 if db_hdr_file.endswith(".gz"): 4268 command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4269 else: 4270 command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}" 4271 # Run 4272 run_parallel_commands([command_extract_header], 1) 4273 4274 # Find chomosomes 4275 log.debug("Find chromosomes ") 4276 sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\"""" 4277 sql_query_chromosomes_df = self.get_query_to_df( 4278 sql_query_chromosomes 4279 ) 4280 chomosomes_list = list(sql_query_chromosomes_df["CHROM"]) 4281 4282 log.debug("Chromosomes found: " + str(list(chomosomes_list))) 4283 4284 # BED columns in the annotation file 4285 if db_file_type in ["bed"]: 4286 annotation_infos = 
"CHROM,POS,POS," + annotation_infos 4287 4288 for chrom in chomosomes_list: 4289 4290 # Create BED on initial VCF 4291 log.debug("Create BED on initial VCF: " + str(tmp_vcf_name)) 4292 tmp_bed = NamedTemporaryFile( 4293 prefix=self.get_prefix(), 4294 dir=self.get_tmp_dir(), 4295 suffix=".bed", 4296 delete=False, 4297 ) 4298 tmp_bed_name = tmp_bed.name 4299 tmp_files.append(tmp_bed_name) 4300 4301 # Detecte regions 4302 log.debug( 4303 f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..." 4304 ) 4305 window = 1000000 4306 sql_query_intervals_for_bed = f""" 4307 SELECT \"#CHROM\", 4308 CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END, 4309 \"POS\"+{window} 4310 FROM {table_variants} as table_variants 4311 WHERE table_variants.\"#CHROM\" = '{chrom}' 4312 """ 4313 regions = self.conn.execute( 4314 sql_query_intervals_for_bed 4315 ).fetchall() 4316 merged_regions = merge_regions(regions) 4317 log.debug( 4318 f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..." 
4319 ) 4320 4321 header = ["#CHROM", "START", "END"] 4322 with open(tmp_bed_name, "w") as f: 4323 # Write the header with tab delimiter 4324 f.write("\t".join(header) + "\n") 4325 for d in merged_regions: 4326 # Write each data row with tab delimiter 4327 f.write("\t".join(map(str, d)) + "\n") 4328 4329 # Tmp files 4330 tmp_annotation_vcf = NamedTemporaryFile( 4331 prefix=self.get_prefix(), 4332 dir=self.get_tmp_dir(), 4333 suffix=".vcf.gz", 4334 delete=False, 4335 ) 4336 tmp_annotation_vcf_name = tmp_annotation_vcf.name 4337 tmp_files.append(tmp_annotation_vcf_name) 4338 tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}") 4339 tmp_annotation_vcf_name_err = ( 4340 tmp_annotation_vcf_name + ".err" 4341 ) 4342 err_files.append(tmp_annotation_vcf_name_err) 4343 4344 # Annotate Command 4345 log.debug( 4346 f"Annotation '{annotation}' - add bcftools command" 4347 ) 4348 4349 # Command 4350 command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 4351 4352 # Add command 4353 commands.append(command_annotate) 4354 4355 # if some commands 4356 if commands: 4357 4358 # Export VCF file 4359 self.export_variant_vcf( 4360 vcf_file=tmp_vcf_name, 4361 remove_info=True, 4362 add_samples=False, 4363 index=True, 4364 ) 4365 4366 # Threads 4367 # calculate threads for annotated commands 4368 if commands: 4369 threads_bcftools_annotate = round(threads / len(commands)) 4370 else: 4371 threads_bcftools_annotate = 1 4372 4373 if not threads_bcftools_annotate: 4374 threads_bcftools_annotate = 1 4375 4376 # Add threads option to bcftools commands 4377 if threads_bcftools_annotate > 1: 4378 commands_threaded = [] 4379 for command in commands: 4380 commands_threaded.append( 4381 command.replace( 4382 f"{bcftools_bin_command} annotate ", 
4383 f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ", 4384 ) 4385 ) 4386 commands = commands_threaded 4387 4388 # Command annotation multithreading 4389 log.debug(f"Annotation - Annotation commands: " + str(commands)) 4390 log.info( 4391 f"Annotation - Annotation multithreaded in " 4392 + str(len(commands)) 4393 + " commands" 4394 ) 4395 4396 run_parallel_commands(commands, threads) 4397 4398 # Merge 4399 tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list) 4400 4401 if tmp_ann_vcf_list_cmd: 4402 4403 # Tmp file 4404 tmp_annotate_vcf = NamedTemporaryFile( 4405 prefix=self.get_prefix(), 4406 dir=self.get_tmp_dir(), 4407 suffix=".vcf.gz", 4408 delete=True, 4409 ) 4410 tmp_annotate_vcf_name = tmp_annotate_vcf.name 4411 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 4412 err_files.append(tmp_annotate_vcf_name_err) 4413 4414 # Tmp file remove command 4415 tmp_files_remove_command = "" 4416 if tmp_files: 4417 tmp_files_remove_command = " && rm -f " + " ".join(tmp_files) 4418 4419 # Command merge 4420 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}" 4421 log.info( 4422 f"Annotation - Annotation merging " 4423 + str(len(commands)) 4424 + " annotated files" 4425 ) 4426 log.debug(f"Annotation - merge command: {merge_command}") 4427 run_parallel_commands([merge_command], 1) 4428 4429 # Error messages 4430 log.info(f"Error/Warning messages:") 4431 error_message_command_all = [] 4432 error_message_command_warning = [] 4433 error_message_command_err = [] 4434 for err_file in err_files: 4435 with open(err_file, "r") as f: 4436 for line in f: 4437 message = line.strip() 4438 error_message_command_all.append(message) 4439 if line.startswith("[W::"): 4440 error_message_command_warning.append(message) 4441 if line.startswith("[E::"): 4442 error_message_command_err.append( 4443 f"{err_file}: 
    def annotation_exomiser(self, threads: int = None) -> bool:
        """
        Annotate variants with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default: None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict):
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list):
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If not exists, database release will be downloaded (take a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, first sample in VCF will be chosen
        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use (defaults to this object's
            configured thread count)
        :return: True when annotation completed; False when the VCF is empty or
            contains no samples (nothing to annotate).
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads: fall back to the object's configured thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # Exomiser data directory (application.properties, assembly sub-folders)
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        # NOTE(review): a missing folder is only logged here, not raised —
        # databases_download_exomiser() below is expected to create/populate it
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser binary (a jar launched through the java wrapper command)
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser section
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, then default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: bail out early when there are no variants at all
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header (mutated in place below to register new INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples: Exomiser needs at least one sample (genotypes) to analyse
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): exomiser_java_options is built but never appended to the
        # command below — presumably get_bin_command() already embeds java
        # options; confirm whether this variable is dead code
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases if not already present (may take a while)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (always re-annotate, even if an "Exomiser" INFO
        # field is already present in the header)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            # All intermediate files live in a temp dir removed on exit
            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                ### ANALYSIS ###
                ################

                # Build the analysis dict used to generate analysis.json:
                # either from the "analysis" param, or from a preset
                # (exome/genome) config file.

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # Analysis from param (may be a dict or a JSON/YAML file path)
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis content
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict
                        # (yaml.safe_load parses both YAML and JSON)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is already a dict, use it directly
                    elif isinstance(param_exomiser_analysis, dict):
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error: unsupported analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict:
                # use the preset (exome/genome) to locate a default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Resolve the preset to a file, in order of preference:
                    if os.path.exists(param_exomiser_preset):
                        # 1) Preset is provided as a full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # 2) Preset is a basename (possibly with subfolders)
                        #    relative to the config folder
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # 3) Construct the conventional preset file name
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists, load it under the "analysis" key
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (YAML or JSON)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error: preset file not found
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict could be created at all, abort
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load it (dict or JSON/YAML file)
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (YAML or JSON)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is already a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error: unsupported phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If still no PhenoPacket -> construct one from "sample"/"subject"
                # and HPO terms found in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket skeleton
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists in param, use it as-is
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not provided -> derive it from a sample ID
                    if not param_exomiser_subject:

                        # Sample ID from param
                        sample = param_exomiser.get("sample", None)

                        # Fallback: first sample of the VCF header
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add subject to phenopacket
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures provided in param, use them
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # Otherwise, try to build them from the "hpo" list
                    if not param_exomiser_phenotypicfeatures:

                        # HPO terms from param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Accept a comma-separated string as well as a list
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Normalize each term to "HP:<digits>" (strip any
                        # non-numeric characters, e.g. an existing "HP:" prefix)
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add phenotypicFeatures to phenopacket (possibly empty)
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # No phenotypic features at all -> the phenotype-based
                    # hiPhivePrioritiser step cannot run; remove it
                    # NOTE(review): this removes from the list while iterating
                    # over it — safe only if at most one step matches; consider
                    # iterating over a copy
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial VCF file name (created later by export_variant_vcf)
                # registered as the phenopacket htsFiles input
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # NOTE(review): membership is tested on the top-level dict but
                # metaData is written under "phenopacket" — confirm the check
                # should not be `"metaData" not in ...["phenopacket"]`
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats (TSV is needed for exomiser_to_info,
                    # VCF for the variants update)
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Force output directory/file name into the temp results folder
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output directory and make sure the formats this
                    # function relies on are present
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files
                # (Exomiser's --analysis/--sample mode needs them separated)

                # Split analysis dict
                # NOTE(review): shallow copy — only the top-level "phenopacket"
                # key is popped below, nested dicts stay shared
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITAL VCF file ###
                #######################

                ### Create list of samples to use and include into initial VCF file ###

                # Subject (main sample): collect sample IDs declared either as
                # the phenopacket subject or as the proband's subject
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample IDs within Pedigree (if any)
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample IDs in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample IDs and pedigree sample IDs (deduplicated)
                # NOTE(review): rebinds the earlier `samples` (VCF header list)
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Export the initial VCF restricted to the selected samples,
                # with INFO stripped (Exomiser only needs genotypes)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is never used — dead code?
                exomiser_command = ""

                # Spring/Exomiser options pointing at the assembly data directory
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release-specific options
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list (only if the file is present locally)
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains a proband -> use split analysis + sample
                # config files; otherwise the single full analysis file
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually a single sample)
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (non-zero exit status -> failure)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Whether to explode TSV columns into individual INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types
                    # NOTE(review): LIMIT 0 yields an empty DataFrame — dtypes
                    # come from the DuckDB-inferred schema, and the .all() test
                    # below is vacuously True on empty columns; confirm intended
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Coordinate/core columns that must not become INFO fields
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type: String by default; Float for
                            # object columns whose values all coerce to numeric;
                            # Integer for non-object (numeric) dtypes
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: INFO field name "Exomiser_<column>",
                            # with '#' dropped and '-' replaced by '_' to stay
                            # VCF-identifier safe
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # SQL fragment emitting "name=value;" when the TSV
                            # cell is neither empty nor "."
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append the concatenated Exomiser fields to
                    # INFO, joining TSV rows to variants on chrom/pos/ref/alt
                    # (TSV CONTIG lacks the "chr" prefix, hence the concat)
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END,
                            (
                                SELECT
                                    concat(
                                        {",".join(sql_query_update_concat_fields)}
                                    )
                                FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                    AND table_parquet.\"START\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                            )
                        )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in result VCF header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with the Exomiser result VCF
                    self.update_from_vcf(output_results_vcf)

        return True
snpeff_databases != "": 5293 log.debug(f"Create snpEff databases folder") 5294 if not os.path.exists(snpeff_databases): 5295 os.makedirs(snpeff_databases) 5296 5297 # Param 5298 param = self.get_param() 5299 log.debug("Param: " + str(param)) 5300 5301 # Param 5302 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5303 log.debug("Options: " + str(options)) 5304 5305 # Param - Assembly 5306 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5307 5308 # Param - Options 5309 snpeff_options = ( 5310 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5311 ) 5312 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5313 snpeff_csvstats = ( 5314 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5315 ) 5316 if snpeff_stats: 5317 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5318 snpeff_stats = full_path(snpeff_stats) 5319 snpeff_options += f" -stats {snpeff_stats}" 5320 if snpeff_csvstats: 5321 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5322 snpeff_csvstats = full_path(snpeff_csvstats) 5323 snpeff_options += f" -csvStats {snpeff_csvstats}" 5324 5325 # Data 5326 table_variants = self.get_table_variants() 5327 5328 # Check if not empty 5329 log.debug("Check if not empty") 5330 sql_query_chromosomes = ( 5331 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5332 ) 5333 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5334 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5335 log.info(f"VCF empty") 5336 return 5337 5338 # Export in VCF 5339 log.debug("Create initial file to annotate") 5340 tmp_vcf = NamedTemporaryFile( 5341 prefix=self.get_prefix(), 5342 dir=self.get_tmp_dir(), 5343 suffix=".vcf.gz", 5344 delete=True, 5345 ) 5346 tmp_vcf_name = tmp_vcf.name 5347 5348 # VCF header 5349 vcf_reader = self.get_header() 5350 log.debug("Initial header: " + 
str(vcf_reader.infos)) 5351 5352 # Existing annotations 5353 for vcf_annotation in self.get_header().infos: 5354 5355 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5356 log.debug( 5357 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5358 ) 5359 5360 # Memory limit 5361 # if config.get("memory", None): 5362 # memory_limit = config.get("memory", "8G") 5363 # else: 5364 # memory_limit = "8G" 5365 memory_limit = self.get_memory("8G") 5366 log.debug(f"memory_limit: {memory_limit}") 5367 5368 # snpEff java options 5369 snpeff_java_options = ( 5370 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5371 ) 5372 log.debug(f"Exomiser java options: {snpeff_java_options}") 5373 5374 force_update_annotation = True 5375 5376 if "ANN" not in self.get_header().infos or force_update_annotation: 5377 5378 # Check snpEff database 5379 log.debug(f"Check snpEff databases {[assembly]}") 5380 databases_download_snpeff( 5381 folder=snpeff_databases, assemblies=[assembly], config=config 5382 ) 5383 5384 # Export VCF file 5385 self.export_variant_vcf( 5386 vcf_file=tmp_vcf_name, 5387 remove_info=True, 5388 add_samples=False, 5389 index=True, 5390 ) 5391 5392 # Tmp file 5393 err_files = [] 5394 tmp_annotate_vcf = NamedTemporaryFile( 5395 prefix=self.get_prefix(), 5396 dir=self.get_tmp_dir(), 5397 suffix=".vcf", 5398 delete=False, 5399 ) 5400 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5401 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5402 err_files.append(tmp_annotate_vcf_name_err) 5403 5404 # Command 5405 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5406 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5407 run_parallel_commands([snpeff_command], 1) 5408 5409 # Error messages 5410 log.info(f"Error/Warning messages:") 5411 error_message_command_all = [] 5412 
error_message_command_warning = [] 5413 error_message_command_err = [] 5414 for err_file in err_files: 5415 with open(err_file, "r") as f: 5416 for line in f: 5417 message = line.strip() 5418 error_message_command_all.append(message) 5419 if line.startswith("[W::"): 5420 error_message_command_warning.append(message) 5421 if line.startswith("[E::"): 5422 error_message_command_err.append(f"{err_file}: " + message) 5423 # log info 5424 for message in list( 5425 set(error_message_command_err + error_message_command_warning) 5426 ): 5427 log.info(f" {message}") 5428 # debug info 5429 for message in list(set(error_message_command_all)): 5430 log.debug(f" {message}") 5431 # failed 5432 if len(error_message_command_err): 5433 log.error("Annotation failed: Error in commands") 5434 raise ValueError("Annotation failed: Error in commands") 5435 5436 # Find annotation in header 5437 with open(tmp_annotate_vcf_name, "rt") as f: 5438 header_list = self.read_vcf_header(f) 5439 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5440 5441 for ann in annovar_vcf_header.infos: 5442 if ann not in self.get_header().infos: 5443 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5444 5445 # Update variants 5446 log.info(f"Annotation - Updating...") 5447 self.update_from_vcf(tmp_annotate_vcf_name) 5448 5449 else: 5450 if "ANN" in self.get_header().infos: 5451 log.debug(f"Existing snpEff annotations in VCF") 5452 if force_update_annotation: 5453 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5454 5455 def annotation_annovar(self, threads: int = None) -> None: 5456 """ 5457 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5458 annotations 5459 5460 :param threads: number of threads to use 5461 :return: the value of the variable "return_value". 
5462 """ 5463 5464 # DEBUG 5465 log.debug("Start annotation with Annovar databases") 5466 5467 # Threads 5468 if not threads: 5469 threads = self.get_threads() 5470 log.debug("Threads: " + str(threads)) 5471 5472 # Tmp en Err files 5473 tmp_files = [] 5474 err_files = [] 5475 5476 # DEBUG 5477 delete_tmp = True 5478 if self.get_config().get("verbosity", "warning") in ["debug"]: 5479 delete_tmp = False 5480 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5481 5482 # Config 5483 config = self.get_config() 5484 log.debug("Config: " + str(config)) 5485 5486 # Config - Folders - Databases 5487 databases_folders = ( 5488 config.get("folders", {}).get("databases", {}).get("annovar", ["."]) 5489 ) 5490 log.debug("Databases annotations: " + str(databases_folders)) 5491 5492 # Config - annovar bin command 5493 annovar_bin_command = get_bin_command( 5494 bin="table_annovar.pl", 5495 tool="annovar", 5496 bin_type="perl", 5497 config=config, 5498 default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar", 5499 ) 5500 if not annovar_bin_command: 5501 msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'" 5502 log.error(msg_err) 5503 raise ValueError(msg_err) 5504 5505 # Config - BCFTools bin command 5506 bcftools_bin_command = get_bin_command( 5507 bin="bcftools", 5508 tool="bcftools", 5509 bin_type="bin", 5510 config=config, 5511 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 5512 ) 5513 if not bcftools_bin_command: 5514 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 5515 log.error(msg_err) 5516 raise ValueError(msg_err) 5517 5518 # Config - annovar databases 5519 annovar_databases = ( 5520 config.get("folders", {}) 5521 .get("databases", {}) 5522 .get("annovar", DEFAULT_ANNOVAR_FOLDER) 5523 ) 5524 if annovar_databases is not None: 5525 if isinstance(annovar_databases, list): 5526 annovar_databases = full_path(annovar_databases[0]) 5527 log.warning(f"Annovar databases folder '{annovar_databases}' selected") 5528 annovar_databases = 
full_path(annovar_databases) 5529 if not os.path.exists(annovar_databases): 5530 log.info(f"Annovar databases folder '{annovar_databases}' created") 5531 Path(annovar_databases).mkdir(parents=True, exist_ok=True) 5532 else: 5533 msg_err = f"Annovar databases configuration failed" 5534 log.error(msg_err) 5535 raise ValueError(msg_err) 5536 5537 # Param 5538 param = self.get_param() 5539 log.debug("Param: " + str(param)) 5540 5541 # Param - options 5542 options = param.get("annotation", {}).get("annovar", {}).get("options", {}) 5543 log.debug("Options: " + str(options)) 5544 5545 # Param - annotations 5546 annotations = ( 5547 param.get("annotation", {}).get("annovar", {}).get("annotations", {}) 5548 ) 5549 log.debug("Annotations: " + str(annotations)) 5550 5551 # Param - Assembly 5552 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5553 5554 # Annovar database assembly 5555 annovar_databases_assembly = f"{annovar_databases}/{assembly}" 5556 if annovar_databases_assembly != "" and not os.path.exists( 5557 annovar_databases_assembly 5558 ): 5559 os.makedirs(annovar_databases_assembly) 5560 5561 # Data 5562 table_variants = self.get_table_variants() 5563 5564 # Check if not empty 5565 log.debug("Check if not empty") 5566 sql_query_chromosomes = ( 5567 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5568 ) 5569 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 5570 if not sql_query_chromosomes_df["count"][0]: 5571 log.info(f"VCF empty") 5572 return 5573 5574 # VCF header 5575 vcf_reader = self.get_header() 5576 log.debug("Initial header: " + str(vcf_reader.infos)) 5577 5578 # Existing annotations 5579 for vcf_annotation in self.get_header().infos: 5580 5581 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5582 log.debug( 5583 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5584 ) 5585 5586 force_update_annotation = True 5587 5588 if annotations: 5589 5590 
commands = [] 5591 tmp_annotates_vcf_name_list = [] 5592 5593 # Export in VCF 5594 log.debug("Create initial file to annotate") 5595 tmp_vcf = NamedTemporaryFile( 5596 prefix=self.get_prefix(), 5597 dir=self.get_tmp_dir(), 5598 suffix=".vcf.gz", 5599 delete=False, 5600 ) 5601 tmp_vcf_name = tmp_vcf.name 5602 tmp_files.append(tmp_vcf_name) 5603 tmp_files.append(tmp_vcf_name + ".tbi") 5604 5605 # Export VCF file 5606 self.export_variant_vcf( 5607 vcf_file=tmp_vcf_name, 5608 remove_info=".", 5609 add_samples=False, 5610 index=True, 5611 ) 5612 5613 # Create file for field rename 5614 log.debug("Create file for field rename") 5615 tmp_rename = NamedTemporaryFile( 5616 prefix=self.get_prefix(), 5617 dir=self.get_tmp_dir(), 5618 suffix=".rename", 5619 delete=False, 5620 ) 5621 tmp_rename_name = tmp_rename.name 5622 tmp_files.append(tmp_rename_name) 5623 5624 # Check Annovar database 5625 log.debug( 5626 f"Check Annovar databases {[assembly]}: {list(annotations.keys())}" 5627 ) 5628 databases_download_annovar( 5629 folder=annovar_databases, 5630 files=list(annotations.keys()), 5631 assemblies=[assembly], 5632 ) 5633 5634 for annotation in annotations: 5635 annotation_fields = annotations[annotation] 5636 5637 if not annotation_fields: 5638 annotation_fields = {"INFO": None} 5639 5640 log.info(f"Annotations Annovar - database '{annotation}'") 5641 log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}") 5642 5643 # Tmp file for annovar 5644 err_files = [] 5645 tmp_annotate_vcf_directory = TemporaryDirectory( 5646 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar" 5647 ) 5648 tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar" 5649 tmp_annotate_vcf_name_annovar = ( 5650 tmp_annotate_vcf_prefix + "." 
+ assembly + "_multianno.vcf" 5651 ) 5652 tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err" 5653 err_files.append(tmp_annotate_vcf_name_err) 5654 tmp_files.append(tmp_annotate_vcf_name_err) 5655 5656 # Tmp file final vcf annotated by annovar 5657 tmp_annotate_vcf = NamedTemporaryFile( 5658 prefix=self.get_prefix(), 5659 dir=self.get_tmp_dir(), 5660 suffix=".vcf.gz", 5661 delete=False, 5662 ) 5663 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5664 tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name) 5665 tmp_files.append(tmp_annotate_vcf_name) 5666 tmp_files.append(tmp_annotate_vcf_name + ".tbi") 5667 5668 # Number of fields 5669 annotation_list = [] 5670 annotation_renamed_list = [] 5671 5672 for annotation_field in annotation_fields: 5673 5674 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 5675 annotation_fields_new_name = annotation_fields.get( 5676 annotation_field, annotation_field 5677 ) 5678 if not annotation_fields_new_name: 5679 annotation_fields_new_name = annotation_field 5680 5681 if ( 5682 force_update_annotation 5683 or annotation_fields_new_name not in self.get_header().infos 5684 ): 5685 annotation_list.append(annotation_field) 5686 annotation_renamed_list.append(annotation_fields_new_name) 5687 else: # annotation_fields_new_name in self.get_header().infos and not force_update_annotation: 5688 log.warning( 5689 f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)" 5690 ) 5691 5692 # Add rename info 5693 run_parallel_commands( 5694 [ 5695 f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}" 5696 ], 5697 1, 5698 ) 5699 5700 # log.debug("fields_to_removed: " + str(fields_to_removed)) 5701 log.debug("annotation_list: " + str(annotation_list)) 5702 5703 # protocol 5704 protocol = annotation 5705 5706 # argument 5707 argument = "" 5708 5709 # operation 5710 operation = "f" 5711 if annotation in ["refGene", "refGeneWithVer"] or 
annotation.startswith( 5712 "ensGene" 5713 ): 5714 operation = "g" 5715 if options.get("genebase", None): 5716 argument = f"""'{options.get("genebase","")}'""" 5717 elif annotation in ["cytoBand"]: 5718 operation = "r" 5719 5720 # argument option 5721 argument_option = "" 5722 if argument != "": 5723 argument_option = " --argument " + argument 5724 5725 # command options 5726 command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """ # --intronhgvs 10 5727 for option in options: 5728 if option not in ["genebase"]: 5729 command_options += f""" --{option}={options[option]}""" 5730 5731 # Command 5732 5733 # Command - Annovar 5734 command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """ 5735 tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf") 5736 5737 # Command - start pipe 5738 command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """ 5739 5740 # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!) 
5741 command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """ 5742 5743 # Command - Special characters (refGene annotation) 5744 command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """ 5745 5746 # Command - Clean empty fields (with value ".") 5747 command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """ 5748 5749 # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file 5750 annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"] 5751 if "ALL" not in annotation_list and "INFO" not in annotation_list: 5752 # for ann in annotation_renamed_list: 5753 for ann in annotation_list: 5754 annovar_fields_to_keep.append(f"^INFO/{ann}") 5755 5756 command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """ 5757 5758 # Command - indexing 5759 command_annovar += f""" && tabix {tmp_annotate_vcf_name} """ 5760 5761 log.debug(f"Annotation - Annovar command: {command_annovar}") 5762 run_parallel_commands([command_annovar], 1) 5763 5764 # Error messages 5765 log.info(f"Error/Warning messages:") 5766 error_message_command_all = [] 5767 error_message_command_warning = [] 5768 error_message_command_err = [] 5769 for err_file in err_files: 5770 with open(err_file, "r") as f: 5771 for line in f: 5772 message = line.strip() 5773 error_message_command_all.append(message) 5774 if line.startswith("[W::") or line.startswith("WARNING"): 5775 error_message_command_warning.append(message) 5776 if line.startswith("[E::") or line.startswith("ERROR"): 5777 
error_message_command_err.append( 5778 f"{err_file}: " + message 5779 ) 5780 # log info 5781 for message in list( 5782 set(error_message_command_err + error_message_command_warning) 5783 ): 5784 log.info(f" {message}") 5785 # debug info 5786 for message in list(set(error_message_command_all)): 5787 log.debug(f" {message}") 5788 # failed 5789 if len(error_message_command_err): 5790 log.error("Annotation failed: Error in commands") 5791 raise ValueError("Annotation failed: Error in commands") 5792 5793 if tmp_annotates_vcf_name_list: 5794 5795 # List of annotated files 5796 tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list) 5797 5798 # Tmp file 5799 tmp_annotate_vcf = NamedTemporaryFile( 5800 prefix=self.get_prefix(), 5801 dir=self.get_tmp_dir(), 5802 suffix=".vcf.gz", 5803 delete=False, 5804 ) 5805 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5806 tmp_files.append(tmp_annotate_vcf_name) 5807 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5808 err_files.append(tmp_annotate_vcf_name_err) 5809 tmp_files.append(tmp_annotate_vcf_name_err) 5810 5811 # Command merge 5812 merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} " 5813 log.info( 5814 f"Annotation Annovar - Annotation merging " 5815 + str(len(tmp_annotates_vcf_name_list)) 5816 + " annotated files" 5817 ) 5818 log.debug(f"Annotation - merge command: {merge_command}") 5819 run_parallel_commands([merge_command], 1) 5820 5821 # Find annotation in header 5822 with bgzf.open(tmp_annotate_vcf_name, "rt") as f: 5823 header_list = self.read_vcf_header(f) 5824 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5825 5826 for ann in annovar_vcf_header.infos: 5827 if ann not in self.get_header().infos: 5828 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5829 5830 # Update variants 5831 log.info(f"Annotation Annovar - 
Updating...") 5832 self.update_from_vcf(tmp_annotate_vcf_name) 5833 5834 # Clean files 5835 # Tmp file remove command 5836 if True: 5837 tmp_files_remove_command = "" 5838 if tmp_files: 5839 tmp_files_remove_command = " ".join(tmp_files) 5840 clean_command = f" rm -f {tmp_files_remove_command} " 5841 log.debug(f"Annotation Annovar - Annotation cleaning ") 5842 log.debug(f"Annotation - cleaning command: {clean_command}") 5843 run_parallel_commands([clean_command], 1) 5844 5845 # Parquet 5846 def annotation_parquet(self, threads: int = None) -> None: 5847 """ 5848 It takes a VCF file, and annotates it with a parquet file 5849 5850 :param threads: number of threads to use for the annotation 5851 :return: the value of the variable "result". 5852 """ 5853 5854 # DEBUG 5855 log.debug("Start annotation with parquet databases") 5856 5857 # Threads 5858 if not threads: 5859 threads = self.get_threads() 5860 log.debug("Threads: " + str(threads)) 5861 5862 # DEBUG 5863 delete_tmp = True 5864 if self.get_config().get("verbosity", "warning") in ["debug"]: 5865 delete_tmp = False 5866 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5867 5868 # Config 5869 databases_folders = set( 5870 self.get_config() 5871 .get("folders", {}) 5872 .get("databases", {}) 5873 .get("annotations", ["."]) 5874 + self.get_config() 5875 .get("folders", {}) 5876 .get("databases", {}) 5877 .get("parquet", ["."]) 5878 ) 5879 log.debug("Databases annotations: " + str(databases_folders)) 5880 5881 # Param 5882 annotations = ( 5883 self.get_param() 5884 .get("annotation", {}) 5885 .get("parquet", {}) 5886 .get("annotations", None) 5887 ) 5888 log.debug("Annotations: " + str(annotations)) 5889 5890 # Assembly 5891 assembly = self.get_param().get( 5892 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5893 ) 5894 5895 # Force Update Annotation 5896 force_update_annotation = ( 5897 self.get_param() 5898 .get("annotation", {}) 5899 .get("options", {}) 5900 .get("annotations_update", 
False) 5901 ) 5902 log.debug(f"force_update_annotation={force_update_annotation}") 5903 force_append_annotation = ( 5904 self.get_param() 5905 .get("annotation", {}) 5906 .get("options", {}) 5907 .get("annotations_append", False) 5908 ) 5909 log.debug(f"force_append_annotation={force_append_annotation}") 5910 5911 # Data 5912 table_variants = self.get_table_variants() 5913 5914 # Check if not empty 5915 log.debug("Check if not empty") 5916 sql_query_chromosomes_df = self.get_query_to_df( 5917 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5918 ) 5919 if not sql_query_chromosomes_df["count"][0]: 5920 log.info(f"VCF empty") 5921 return 5922 5923 # VCF header 5924 vcf_reader = self.get_header() 5925 log.debug("Initial header: " + str(vcf_reader.infos)) 5926 5927 # Nb Variants POS 5928 log.debug("NB Variants Start") 5929 nb_variants = self.conn.execute( 5930 f"SELECT count(*) AS count FROM variants" 5931 ).fetchdf()["count"][0] 5932 log.debug("NB Variants Stop") 5933 5934 # Existing annotations 5935 for vcf_annotation in self.get_header().infos: 5936 5937 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5938 log.debug( 5939 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5940 ) 5941 5942 # Added columns 5943 added_columns = [] 5944 5945 # drop indexes 5946 log.debug(f"Drop indexes...") 5947 self.drop_indexes() 5948 5949 if annotations: 5950 5951 if "ALL" in annotations: 5952 5953 all_param = annotations.get("ALL", {}) 5954 all_param_formats = all_param.get("formats", None) 5955 all_param_releases = all_param.get("releases", None) 5956 5957 databases_infos_dict = self.scan_databases( 5958 database_formats=all_param_formats, 5959 database_releases=all_param_releases, 5960 ) 5961 for database_infos in databases_infos_dict.keys(): 5962 if database_infos not in annotations: 5963 annotations[database_infos] = {"INFO": None} 5964 5965 for annotation in annotations: 5966 5967 if annotation in ["ALL"]: 
5968 continue 5969 5970 # Annotation Name 5971 annotation_name = os.path.basename(annotation) 5972 5973 # Annotation fields 5974 annotation_fields = annotations[annotation] 5975 if not annotation_fields: 5976 annotation_fields = {"INFO": None} 5977 5978 log.debug(f"Annotation '{annotation_name}'") 5979 log.debug( 5980 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5981 ) 5982 5983 # Create Database 5984 database = Database( 5985 database=annotation, 5986 databases_folders=databases_folders, 5987 assembly=assembly, 5988 ) 5989 5990 # Find files 5991 parquet_file = database.get_database() 5992 parquet_hdr_file = database.get_header_file() 5993 parquet_type = database.get_type() 5994 5995 # Check if files exists 5996 if not parquet_file or not parquet_hdr_file: 5997 msg_err_list = [] 5998 if not parquet_file: 5999 msg_err_list.append( 6000 f"Annotation failed: Annotation file not found" 6001 ) 6002 if parquet_file and not parquet_hdr_file: 6003 msg_err_list.append( 6004 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 6005 ) 6006 6007 log.error(". ".join(msg_err_list)) 6008 raise ValueError(". 
".join(msg_err_list)) 6009 else: 6010 # Get parquet connexion 6011 parquet_sql_attach = database.get_sql_database_attach( 6012 output="query" 6013 ) 6014 if parquet_sql_attach: 6015 self.conn.execute(parquet_sql_attach) 6016 parquet_file_link = database.get_sql_database_link() 6017 # Log 6018 log.debug( 6019 f"Annotation '{annotation_name}' - file: " 6020 + str(parquet_file) 6021 + " and " 6022 + str(parquet_hdr_file) 6023 ) 6024 6025 # Database full header columns 6026 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 6027 parquet_hdr_file 6028 ) 6029 # Log 6030 log.debug( 6031 "Annotation database header columns : " 6032 + str(parquet_hdr_vcf_header_columns) 6033 ) 6034 6035 # Load header as VCF object 6036 parquet_hdr_vcf_header_infos = database.get_header().infos 6037 # Log 6038 log.debug( 6039 "Annotation database header: " 6040 + str(parquet_hdr_vcf_header_infos) 6041 ) 6042 6043 # Get extra infos 6044 parquet_columns = database.get_extra_columns() 6045 # Log 6046 log.debug("Annotation database Columns: " + str(parquet_columns)) 6047 6048 # Add extra columns if "ALL" in annotation_fields 6049 # if "ALL" in annotation_fields: 6050 # allow_add_extra_column = True 6051 if "ALL" in annotation_fields and database.get_extra_columns(): 6052 for extra_column in database.get_extra_columns(): 6053 if ( 6054 extra_column not in annotation_fields 6055 and extra_column.replace("INFO/", "") 6056 not in parquet_hdr_vcf_header_infos 6057 ): 6058 parquet_hdr_vcf_header_infos[extra_column] = ( 6059 vcf.parser._Info( 6060 extra_column, 6061 ".", 6062 "String", 6063 f"{extra_column} description", 6064 "unknown", 6065 "unknown", 6066 self.code_type_map["String"], 6067 ) 6068 ) 6069 6070 # For all fields in database 6071 annotation_fields_all = False 6072 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6073 annotation_fields_all = True 6074 annotation_fields = { 6075 key: key for key in parquet_hdr_vcf_header_infos 6076 } 6077 6078 log.debug( 6079 
"Annotation database header - All annotations added: " 6080 + str(annotation_fields) 6081 ) 6082 6083 # Init 6084 6085 # List of annotation fields to use 6086 sql_query_annotation_update_info_sets = [] 6087 6088 # List of annotation to agregate 6089 sql_query_annotation_to_agregate = [] 6090 6091 # Number of fields 6092 nb_annotation_field = 0 6093 6094 # Annotation fields processed 6095 annotation_fields_processed = [] 6096 6097 # Columns mapping 6098 map_columns = database.map_columns( 6099 columns=annotation_fields, prefixes=["INFO/"] 6100 ) 6101 6102 # Query dict for fields to remove (update option) 6103 query_dict_remove = {} 6104 6105 # Fetch Anotation fields 6106 for annotation_field in annotation_fields: 6107 6108 # annotation_field_column 6109 annotation_field_column = map_columns.get( 6110 annotation_field, "INFO" 6111 ) 6112 6113 # field new name, if parametered 6114 annotation_fields_new_name = annotation_fields.get( 6115 annotation_field, annotation_field 6116 ) 6117 if not annotation_fields_new_name: 6118 annotation_fields_new_name = annotation_field 6119 6120 # To annotate 6121 # force_update_annotation = True 6122 # force_append_annotation = True 6123 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6124 if annotation_field in parquet_hdr_vcf_header_infos and ( 6125 force_update_annotation 6126 or force_append_annotation 6127 or ( 6128 annotation_fields_new_name 6129 not in self.get_header().infos 6130 ) 6131 ): 6132 6133 # Add field to annotation to process list 6134 annotation_fields_processed.append( 6135 annotation_fields_new_name 6136 ) 6137 6138 # explode infos for the field 6139 annotation_fields_new_name_info_msg = "" 6140 if ( 6141 force_update_annotation 6142 and annotation_fields_new_name 6143 in self.get_header().infos 6144 ): 6145 # Remove field from INFO 6146 query = f""" 6147 UPDATE {table_variants} as table_variants 6148 SET INFO = 
REGEXP_REPLACE( 6149 concat(table_variants.INFO,''), 6150 ';*{annotation_fields_new_name}=[^;]*', 6151 '' 6152 ) 6153 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6154 """ 6155 annotation_fields_new_name_info_msg = " [update]" 6156 query_dict_remove[ 6157 f"remove 'INFO/{annotation_fields_new_name}'" 6158 ] = query 6159 6160 # Sep between fields in INFO 6161 nb_annotation_field += 1 6162 if nb_annotation_field > 1: 6163 annotation_field_sep = ";" 6164 else: 6165 annotation_field_sep = "" 6166 6167 log.info( 6168 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6169 ) 6170 6171 # Add INFO field to header 6172 parquet_hdr_vcf_header_infos_number = ( 6173 parquet_hdr_vcf_header_infos[annotation_field].num 6174 or "." 6175 ) 6176 parquet_hdr_vcf_header_infos_type = ( 6177 parquet_hdr_vcf_header_infos[annotation_field].type 6178 or "String" 6179 ) 6180 parquet_hdr_vcf_header_infos_description = ( 6181 parquet_hdr_vcf_header_infos[annotation_field].desc 6182 or f"{annotation_field} description" 6183 ) 6184 parquet_hdr_vcf_header_infos_source = ( 6185 parquet_hdr_vcf_header_infos[annotation_field].source 6186 or "unknown" 6187 ) 6188 parquet_hdr_vcf_header_infos_version = ( 6189 parquet_hdr_vcf_header_infos[annotation_field].version 6190 or "unknown" 6191 ) 6192 6193 vcf_reader.infos[annotation_fields_new_name] = ( 6194 vcf.parser._Info( 6195 annotation_fields_new_name, 6196 parquet_hdr_vcf_header_infos_number, 6197 parquet_hdr_vcf_header_infos_type, 6198 parquet_hdr_vcf_header_infos_description, 6199 parquet_hdr_vcf_header_infos_source, 6200 parquet_hdr_vcf_header_infos_version, 6201 self.code_type_map[ 6202 parquet_hdr_vcf_header_infos_type 6203 ], 6204 ) 6205 ) 6206 6207 # Append 6208 if force_append_annotation: 6209 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 
6210 else: 6211 query_case_when_append = "" 6212 6213 # Annotation/Update query fields 6214 # Found in INFO column 6215 if ( 6216 annotation_field_column == "INFO" 6217 and "INFO" in parquet_hdr_vcf_header_columns 6218 ): 6219 sql_query_annotation_update_info_sets.append( 6220 f""" 6221 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6222 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6223 ELSE '' 6224 END 6225 """ 6226 ) 6227 # Found in a specific column 6228 else: 6229 sql_query_annotation_update_info_sets.append( 6230 f""" 6231 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6232 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6233 ELSE '' 6234 END 6235 """ 6236 ) 6237 sql_query_annotation_to_agregate.append( 6238 f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6239 ) 6240 6241 # Not to annotate 6242 else: 6243 6244 if force_update_annotation: 6245 annotation_message = "forced" 6246 else: 6247 annotation_message = "skipped" 6248 6249 if annotation_field not in parquet_hdr_vcf_header_infos: 6250 log.warning( 6251 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6252 ) 6253 if annotation_fields_new_name in self.get_header().infos: 6254 log.warning( 6255 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6256 ) 6257 6258 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6259 # allow_annotation_full_info = True 6260 allow_annotation_full_info = not force_append_annotation 6261 6262 if parquet_type in ["regions"]: 6263 allow_annotation_full_info = False 6264 6265 if ( 6266 allow_annotation_full_info 6267 and nb_annotation_field == len(annotation_fields) 6268 and annotation_fields_all 6269 and ( 6270 "INFO" in parquet_hdr_vcf_header_columns 6271 and "INFO" in database.get_extra_columns() 6272 ) 6273 ): 6274 log.debug("Column INFO annotation enabled") 6275 sql_query_annotation_update_info_sets = [] 6276 sql_query_annotation_update_info_sets.append( 6277 f" table_parquet.INFO " 6278 ) 6279 6280 if sql_query_annotation_update_info_sets: 6281 6282 # Annotate 6283 log.info(f"Annotation '{annotation_name}' - Annotation...") 6284 6285 # Join query annotation update info sets for SQL 6286 sql_query_annotation_update_info_sets_sql = ",".join( 6287 sql_query_annotation_update_info_sets 6288 ) 6289 6290 # Check chromosomes list (and variants infos) 6291 sql_query_chromosomes = f""" 6292 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6293 FROM {table_variants} as table_variants 6294 GROUP BY table_variants."#CHROM" 6295 ORDER BY table_variants."#CHROM" 6296 """ 6297 sql_query_chromosomes_df = self.conn.execute( 6298 sql_query_chromosomes 6299 ).df() 6300 sql_query_chromosomes_dict = { 6301 entry["CHROM"]: { 6302 "count": entry["count_variants"], 6303 "min": entry["min_variants"], 6304 "max": entry["max_variants"], 6305 } 6306 for index, entry in sql_query_chromosomes_df.iterrows() 6307 } 6308 6309 # Init 6310 nb_of_query = 0 6311 nb_of_variant_annotated = 0 6312 query_dict = query_dict_remove 6313 6314 # for chrom in sql_query_chromosomes_df["CHROM"]: 6315 for chrom in sql_query_chromosomes_dict: 6316 6317 # Number of variant by chromosome 6318 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6319 chrom, {} 6320 ).get("count", 0) 6321 6322 
log.debug( 6323 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6324 ) 6325 6326 # Annotation with regions database 6327 if parquet_type in ["regions"]: 6328 sql_query_annotation_from_clause = f""" 6329 FROM ( 6330 SELECT 6331 '{chrom}' AS \"#CHROM\", 6332 table_variants_from.\"POS\" AS \"POS\", 6333 {",".join(sql_query_annotation_to_agregate)} 6334 FROM {table_variants} as table_variants_from 6335 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6336 table_parquet_from."#CHROM" = '{chrom}' 6337 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6338 AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6339 ) 6340 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6341 GROUP BY table_variants_from.\"POS\" 6342 ) 6343 as table_parquet 6344 """ 6345 6346 sql_query_annotation_where_clause = """ 6347 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6348 AND table_parquet.\"POS\" = table_variants.\"POS\" 6349 """ 6350 6351 # Annotation with variants database 6352 else: 6353 sql_query_annotation_from_clause = f""" 6354 FROM {parquet_file_link} as table_parquet 6355 """ 6356 sql_query_annotation_where_clause = f""" 6357 table_variants."#CHROM" = '{chrom}' 6358 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6359 AND table_parquet.\"POS\" = table_variants.\"POS\" 6360 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6361 AND table_parquet.\"REF\" = table_variants.\"REF\" 6362 """ 6363 6364 # Create update query 6365 sql_query_annotation_chrom_interval_pos = f""" 6366 UPDATE {table_variants} as table_variants 6367 SET INFO = 6368 concat( 6369 CASE WHEN table_variants.INFO NOT IN ('','.') 6370 THEN table_variants.INFO 6371 ELSE '' 6372 END 6373 , 6374 CASE WHEN table_variants.INFO NOT IN ('','.') 6375 AND ( 6376 concat({sql_query_annotation_update_info_sets_sql}) 6377 ) 6378 NOT IN ('','.') 6379 THEN ';' 6380 ELSE '' 6381 END 6382 , 6383 
{sql_query_annotation_update_info_sets_sql} 6384 ) 6385 {sql_query_annotation_from_clause} 6386 WHERE {sql_query_annotation_where_clause} 6387 ; 6388 """ 6389 6390 # Add update query to dict 6391 query_dict[ 6392 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6393 ] = sql_query_annotation_chrom_interval_pos 6394 6395 nb_of_query = len(query_dict) 6396 num_query = 0 6397 6398 # SET max_expression_depth TO x 6399 self.conn.execute("SET max_expression_depth TO 10000") 6400 6401 for query_name in query_dict: 6402 query = query_dict[query_name] 6403 num_query += 1 6404 log.info( 6405 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6406 ) 6407 result = self.conn.execute(query) 6408 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6409 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6410 log.info( 6411 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6412 ) 6413 6414 log.info( 6415 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6416 ) 6417 6418 else: 6419 6420 log.info( 6421 f"Annotation '{annotation_name}' - No Annotations available" 6422 ) 6423 6424 log.debug("Final header: " + str(vcf_reader.infos)) 6425 6426 # Remove added columns 6427 for added_column in added_columns: 6428 self.drop_column(column=added_column) 6429 6430 def annotation_splice(self, threads: int = None) -> None: 6431 """ 6432 This function annotate with snpEff 6433 6434 :param threads: The number of threads to use 6435 :return: the value of the variable "return_value". 
6436 """ 6437 6438 # DEBUG 6439 log.debug("Start annotation with splice tools") 6440 6441 # Threads 6442 if not threads: 6443 threads = self.get_threads() 6444 log.debug("Threads: " + str(threads)) 6445 6446 # DEBUG 6447 delete_tmp = True 6448 if self.get_config().get("verbosity", "warning") in ["debug"]: 6449 delete_tmp = False 6450 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6451 6452 # Config 6453 config = self.get_config() 6454 log.debug("Config: " + str(config)) 6455 splice_config = config.get("tools", {}).get("splice", {}) 6456 if not splice_config: 6457 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6458 msg_err = "No Splice tool config" 6459 raise ValueError(msg_err) 6460 log.debug(f"splice_config: {splice_config}") 6461 6462 # Config - Folders - Databases 6463 databases_folders = ( 6464 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6465 ) 6466 log.debug("Databases annotations: " + str(databases_folders)) 6467 6468 # Splice docker image 6469 splice_docker_image = splice_config.get("docker").get("image") 6470 6471 # Pull splice image if it's not already there 6472 if not check_docker_image_exists(splice_docker_image): 6473 log.warning( 6474 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6475 ) 6476 try: 6477 command(f"docker pull {splice_config.get('docker').get('image')}") 6478 except subprocess.CalledProcessError: 6479 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6480 log.error(msg_err) 6481 raise ValueError(msg_err) 6482 6483 # Config - splice databases 6484 splice_databases = ( 6485 config.get("folders", {}) 6486 .get("databases", {}) 6487 .get("splice", DEFAULT_SPLICE_FOLDER) 6488 ) 6489 splice_databases = full_path(splice_databases) 6490 6491 # Param 6492 param = self.get_param() 6493 log.debug("Param: " + str(param)) 6494 6495 # Param 6496 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6497 
log.debug("Options: " + str(options)) 6498 6499 # Data 6500 table_variants = self.get_table_variants() 6501 6502 # Check if not empty 6503 log.debug("Check if not empty") 6504 sql_query_chromosomes = ( 6505 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6506 ) 6507 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6508 log.info("VCF empty") 6509 return None 6510 6511 # Export in VCF 6512 log.debug("Create initial file to annotate") 6513 6514 # Create output folder / work folder 6515 if options.get("output_folder", ""): 6516 output_folder = options.get("output_folder", "") 6517 if not os.path.exists(output_folder): 6518 Path(output_folder).mkdir(parents=True, exist_ok=True) 6519 else: 6520 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6521 if not os.path.exists(output_folder): 6522 Path(output_folder).mkdir(parents=True, exist_ok=True) 6523 6524 if options.get("workdir", ""): 6525 workdir = options.get("workdir", "") 6526 else: 6527 workdir = "/work" 6528 6529 # Create tmp VCF file 6530 tmp_vcf = NamedTemporaryFile( 6531 prefix=self.get_prefix(), 6532 dir=output_folder, 6533 suffix=".vcf", 6534 delete=False, 6535 ) 6536 tmp_vcf_name = tmp_vcf.name 6537 6538 # VCF header 6539 header = self.get_header() 6540 6541 # Existing annotations 6542 for vcf_annotation in self.get_header().infos: 6543 6544 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6545 log.debug( 6546 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6547 ) 6548 6549 # Memory limit 6550 if config.get("memory", None): 6551 memory_limit = config.get("memory", "8G").upper() 6552 # upper() 6553 else: 6554 memory_limit = "8G" 6555 log.debug(f"memory_limit: {memory_limit}") 6556 6557 # Check number of variants to annotate 6558 where_clause_regex_spliceai = r"SpliceAI_\w+" 6559 where_clause_regex_spip = r"SPiP_\w+" 6560 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6561 df_list_of_variants_to_annotate = self.get_query_to_df( 6562 query=f""" SELECT * FROM variants {where_clause} """ 6563 ) 6564 if len(df_list_of_variants_to_annotate) == 0: 6565 log.warning( 6566 f"No variants to annotate with splice. Variants probably already annotated with splice" 6567 ) 6568 return None 6569 else: 6570 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6571 6572 # Export VCF file 6573 self.export_variant_vcf( 6574 vcf_file=tmp_vcf_name, 6575 remove_info=True, 6576 add_samples=True, 6577 index=False, 6578 where_clause=where_clause, 6579 ) 6580 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6581 if any(value for value in splice_config.values() if value is None): 6582 log.warning("At least one splice config parameter is empty") 6583 # exit annotation_splice 6584 return None 6585 6586 # Params in splice nf 6587 def check_values(dico: dict): 6588 """ 6589 Ensure parameters for NF splice pipeline 6590 """ 6591 for key, val in dico.items(): 6592 if key == "genome": 6593 if any( 6594 assemb in options.get("genome", {}) 6595 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6596 ): 6597 yield f"--{key} hg19" 6598 elif any( 6599 assemb in options.get("genome", {}) 6600 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6601 ): 6602 yield f"--{key} hg38" 6603 elif ( 6604 (isinstance(val, str) and val) 6605 or isinstance(val, int) 6606 or isinstance(val, bool) 6607 ): 6608 yield f"--{key} {val}" 6609 6610 # Genome 6611 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6612 options["genome"] = genome 6613 # NF params 6614 nf_params = [] 6615 # Add options 6616 if options: 6617 log.debug(options) 6618 nf_params = list(check_values(options)) 6619 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6620 else: 6621 log.debug("No NF params provided") 6622 # Add threads 6623 if "threads" not in 
options.keys(): 6624 nf_params.append(f"--threads {threads}") 6625 # Genome path 6626 genome_path = find_genome( 6627 config.get("folders", {}) 6628 .get("databases", {}) 6629 .get("genomes", DEFAULT_GENOME_FOLDER), 6630 file=f"{genome}.fa", 6631 ) 6632 # Add genome path 6633 if not genome_path: 6634 raise ValueError( 6635 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6636 ) 6637 else: 6638 log.debug(f"Genome: {genome_path}") 6639 nf_params.append(f"--genome_path {genome_path}") 6640 6641 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6642 """ 6643 Setting up updated databases for SPiP and SpliceAI 6644 """ 6645 6646 try: 6647 6648 # SpliceAI assembly transcriptome 6649 spliceai_assembly = os.path.join( 6650 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6651 options.get("genome"), 6652 "transcriptome", 6653 ) 6654 spip_assembly = options.get("genome") 6655 6656 spip = find( 6657 f"transcriptome_{spip_assembly}.RData", 6658 config.get("folders", {}).get("databases", {}).get("spip", {}), 6659 ) 6660 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6661 log.debug(f"SPiP annotations: {spip}") 6662 log.debug(f"SpliceAI annotations: {spliceai}") 6663 if spip and spliceai: 6664 return [ 6665 f"--spip_transcriptome {spip}", 6666 f"--spliceai_transcriptome {spliceai}", 6667 ] 6668 else: 6669 log.warning( 6670 "Can't find splice databases in configuration, use annotations file from image" 6671 ) 6672 except TypeError: 6673 log.warning( 6674 "Can't find splice databases in configuration, use annotations file from image" 6675 ) 6676 return [] 6677 6678 # Add options, check if transcriptome option have already beend provided 6679 if ( 6680 "spip_transcriptome" not in nf_params 6681 and "spliceai_transcriptome" not in nf_params 6682 ): 6683 splice_reference = splice_annotations(options, config) 6684 if splice_reference: 6685 
nf_params.extend(splice_reference) 6686 # nf_params.append(f"--output_folder {output_folder}") 6687 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6688 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6689 log.debug(cmd) 6690 splice_config["docker"]["command"] = cmd 6691 6692 # Ensure proxy is set 6693 proxy = [ 6694 f"-e {var}={os.getenv(var)}" 6695 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6696 if os.getenv(var) is not None 6697 ] 6698 docker_cmd = get_bin_command( 6699 tool="splice", 6700 bin_type="docker", 6701 config=config, 6702 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6703 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6704 ) 6705 # print(docker_cmd) 6706 # exit() 6707 # Docker debug 6708 # if splice_config.get("rm_container"): 6709 # rm_container = "--rm" 6710 # else: 6711 # rm_container = "" 6712 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6713 log.debug(docker_cmd) 6714 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6715 log.debug(res.stdout) 6716 if res.stderr: 6717 log.error(res.stderr) 6718 res.check_returncode() 6719 # Update variants 6720 log.info("Annotation - Updating...") 6721 # Test find output vcf 6722 log.debug( 6723 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6724 ) 6725 output_vcf = [] 6726 # Wrong folder to look in 6727 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6728 if ( 6729 files 6730 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6731 ): 6732 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6733 # log.debug(os.listdir(options.get("output_folder"))) 6734 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6735 if not output_vcf: 6736 log.debug( 6737 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6738 ) 6739 else: 6740 # Get new header from annotated vcf 6741 log.debug(f"Initial header: {len(header.infos)} fields") 6742 # Create new header with splice infos 6743 new_vcf = Variants(input=output_vcf[0]) 6744 new_vcf_header = new_vcf.get_header().infos 6745 for keys, infos in new_vcf_header.items(): 6746 if keys not in header.infos.keys(): 6747 header.infos[keys] = infos 6748 log.debug(f"New header: {len(header.infos)} fields") 6749 log.debug(f"Splice tmp output: {output_vcf[0]}") 6750 self.update_from_vcf(output_vcf[0]) 6751 6752 # Remove file 6753 remove_if_exists(output_vcf) 6754 6755 ### 6756 # Prioritization 6757 ### 6758 6759 def get_config_default(self, name: str) -> dict: 6760 """ 6761 The function `get_config_default` returns a dictionary containing default configurations for 6762 various calculations and prioritizations. 6763 6764 :param name: The `get_config_default` function returns a dictionary containing default 6765 configurations for different calculations and prioritizations. The `name` parameter is used to 6766 specify which specific configuration to retrieve from the dictionary 6767 :type name: str 6768 :return: The function `get_config_default` returns a dictionary containing default configuration 6769 settings for different calculations and prioritizations. The specific configuration settings are 6770 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6771 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6772 returned. If there is no match, an empty dictionary is returned. 
6773 """ 6774 6775 config_default = { 6776 "calculations": { 6777 "variant_chr_pos_alt_ref": { 6778 "type": "sql", 6779 "name": "variant_chr_pos_alt_ref", 6780 "description": "Create a variant ID with chromosome, position, alt and ref", 6781 "available": False, 6782 "output_column_name": "variant_chr_pos_alt_ref", 6783 "output_column_type": "String", 6784 "output_column_description": "variant ID with chromosome, position, alt and ref", 6785 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6786 "operation_info": True, 6787 }, 6788 "VARTYPE": { 6789 "type": "sql", 6790 "name": "VARTYPE", 6791 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6792 "available": True, 6793 "table": "variants", 6794 "output_column_name": "VARTYPE", 6795 "output_column_type": "String", 6796 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6797 "operation_query": """ 6798 CASE 6799 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6800 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6801 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6802 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6803 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6804 ELSE 'UNDEFINED' 6805 END 6806 """, 6807 "info_fields": ["SVTYPE"], 6808 "operation_info": True, 6809 }, 6810 "snpeff_hgvs": { 6811 "type": "python", 6812 "name": "snpeff_hgvs", 6813 "description": "HGVS nomenclatures from snpEff annotation", 6814 "available": True, 6815 "function_name": "calculation_extract_snpeff_hgvs", 6816 "function_params": ["snpeff_hgvs", "ANN"], 6817 }, 6818 "snpeff_ann_explode": { 6819 "type": "python", 6820 "name": "snpeff_ann_explode", 6821 "description": "Explode snpEff annotations with uniquify values", 6822 "available": True, 6823 "function_name": "calculation_snpeff_ann_explode", 6824 "function_params": [False, "fields", "snpeff_", "ANN"], 6825 }, 6826 "snpeff_ann_explode_uniquify": { 6827 "type": "python", 6828 
"name": "snpeff_ann_explode_uniquify", 6829 "description": "Explode snpEff annotations", 6830 "available": True, 6831 "function_name": "calculation_snpeff_ann_explode", 6832 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6833 }, 6834 "snpeff_ann_explode_json": { 6835 "type": "python", 6836 "name": "snpeff_ann_explode_json", 6837 "description": "Explode snpEff annotations in JSON format", 6838 "available": True, 6839 "function_name": "calculation_snpeff_ann_explode", 6840 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6841 }, 6842 "NOMEN": { 6843 "type": "python", 6844 "name": "NOMEN", 6845 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6846 "available": True, 6847 "function_name": "calculation_extract_nomen", 6848 "function_params": [], 6849 }, 6850 "RENAME_INFO_FIELDS": { 6851 "type": "python", 6852 "name": "RENAME_INFO_FIELDS", 6853 "description": "Rename or remove INFO/tags", 6854 "available": True, 6855 "function_name": "calculation_rename_info_fields", 6856 "function_params": [], 6857 }, 6858 "FINDBYPIPELINE": { 6859 "type": "python", 6860 "name": "FINDBYPIPELINE", 6861 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6862 "available": True, 6863 "function_name": "calculation_find_by_pipeline", 6864 "function_params": ["findbypipeline"], 6865 }, 6866 "FINDBYSAMPLE": { 6867 "type": "python", 6868 "name": "FINDBYSAMPLE", 6869 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6870 "available": True, 6871 "function_name": "calculation_find_by_pipeline", 6872 "function_params": ["findbysample"], 6873 }, 6874 "GENOTYPECONCORDANCE": { 6875 "type": "python", 6876 "name": "GENOTYPECONCORDANCE", 6877 "description": "Concordance of genotype for multi caller VCF", 6878 "available": True, 6879 "function_name": "calculation_genotype_concordance", 6880 "function_params": [], 6881 }, 6882 
"BARCODE": { 6883 "type": "python", 6884 "name": "BARCODE", 6885 "description": "BARCODE as VaRank tool", 6886 "available": True, 6887 "function_name": "calculation_barcode", 6888 "function_params": [], 6889 }, 6890 "BARCODEFAMILY": { 6891 "type": "python", 6892 "name": "BARCODEFAMILY", 6893 "description": "BARCODEFAMILY as VaRank tool", 6894 "available": True, 6895 "function_name": "calculation_barcode_family", 6896 "function_params": ["BCF"], 6897 }, 6898 "TRIO": { 6899 "type": "python", 6900 "name": "TRIO", 6901 "description": "Inheritance for a trio family", 6902 "available": True, 6903 "function_name": "calculation_trio", 6904 "function_params": [], 6905 }, 6906 "VAF": { 6907 "type": "python", 6908 "name": "VAF", 6909 "description": "Variant Allele Frequency (VAF) harmonization", 6910 "available": True, 6911 "function_name": "calculation_vaf_normalization", 6912 "function_params": [], 6913 }, 6914 "VAF_stats": { 6915 "type": "python", 6916 "name": "VAF_stats", 6917 "description": "Variant Allele Frequency (VAF) statistics", 6918 "available": True, 6919 "function_name": "calculation_genotype_stats", 6920 "function_params": ["VAF"], 6921 }, 6922 "DP_stats": { 6923 "type": "python", 6924 "name": "DP_stats", 6925 "description": "Depth (DP) statistics", 6926 "available": True, 6927 "function_name": "calculation_genotype_stats", 6928 "function_params": ["DP"], 6929 }, 6930 "variant_id": { 6931 "type": "python", 6932 "name": "variant_id", 6933 "description": "Variant ID generated from variant position and type", 6934 "available": True, 6935 "function_name": "calculation_variant_id", 6936 "function_params": [], 6937 }, 6938 "transcripts_json": { 6939 "type": "python", 6940 "name": "transcripts_json", 6941 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6942 "available": True, 6943 "function_name": "calculation_transcripts_annotation", 6944 "function_params": ["transcripts_json", None], 6945 }, 6946 "transcripts_ann": { 6947 
"type": "python", 6948 "name": "transcripts_ann", 6949 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6950 "available": True, 6951 "function_name": "calculation_transcripts_annotation", 6952 "function_params": [None, "transcripts_ann"], 6953 }, 6954 "transcripts_annotations": { 6955 "type": "python", 6956 "name": "transcripts_annotations", 6957 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6958 "available": True, 6959 "function_name": "calculation_transcripts_annotation", 6960 "function_params": [None, None], 6961 }, 6962 "transcripts_prioritization": { 6963 "type": "python", 6964 "name": "transcripts_prioritization", 6965 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6966 "available": True, 6967 "function_name": "calculation_transcripts_prioritization", 6968 "function_params": [], 6969 }, 6970 "transcripts_export": { 6971 "type": "python", 6972 "name": "transcripts_export", 6973 "description": "Export transcripts table/view as a file (using param.json)", 6974 "available": True, 6975 "function_name": "calculation_transcripts_export", 6976 "function_params": [], 6977 }, 6978 }, 6979 "prioritizations": { 6980 "default": { 6981 "ANN2": [ 6982 { 6983 "type": "contains", 6984 "value": "HIGH", 6985 "score": 5, 6986 "flag": "PASS", 6987 "comment": [ 6988 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6989 ], 6990 }, 6991 { 6992 "type": "contains", 6993 "value": "MODERATE", 6994 "score": 3, 6995 "flag": "PASS", 6996 "comment": [ 6997 "A non-disruptive variant that might change protein effectiveness" 6998 ], 6999 }, 7000 { 7001 "type": "contains", 7002 "value": "LOW", 7003 "score": 0, 7004 "flag": "FILTERED", 7005 "comment": [ 7006 "Assumed to be mostly harmless or unlikely to change protein behavior" 7007 ], 7008 
}, 7009 { 7010 "type": "contains", 7011 "value": "MODIFIER", 7012 "score": 0, 7013 "flag": "FILTERED", 7014 "comment": [ 7015 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 7016 ], 7017 }, 7018 ], 7019 } 7020 }, 7021 } 7022 7023 return config_default.get(name, None) 7024 7025 def get_config_json( 7026 self, name: str, config_dict: dict = {}, config_file: str = None 7027 ) -> dict: 7028 """ 7029 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7030 default values, a dictionary, and a file. 7031 7032 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7033 the name of the configuration. It is used to identify and retrieve the configuration settings 7034 for a specific component or module 7035 :type name: str 7036 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7037 dictionary that allows you to provide additional configuration settings or overrides. When you 7038 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7039 the key is the configuration setting you want to override or 7040 :type config_dict: dict 7041 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7042 specify the path to a configuration file that contains additional settings. If provided, the 7043 function will read the contents of this file and update the configuration dictionary with the 7044 values found in the file, overriding any existing values with the 7045 :type config_file: str 7046 :return: The function `get_config_json` returns a dictionary containing the configuration 7047 settings. 
7048 """ 7049 7050 # Create with default prioritizations 7051 config_default = self.get_config_default(name=name) 7052 configuration = config_default 7053 # log.debug(f"configuration={configuration}") 7054 7055 # Replace prioritizations from dict 7056 for config in config_dict: 7057 configuration[config] = config_dict[config] 7058 7059 # Replace prioritizations from file 7060 config_file = full_path(config_file) 7061 if config_file: 7062 if os.path.exists(config_file): 7063 with open(config_file) as config_file_content: 7064 config_file_dict = yaml.safe_load(config_file_content) 7065 for config in config_file_dict: 7066 configuration[config] = config_file_dict[config] 7067 else: 7068 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7069 log.error(msg_error) 7070 raise ValueError(msg_error) 7071 7072 return configuration 7073 7074 def prioritization( 7075 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7076 ) -> bool: 7077 """ 7078 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7079 prioritizes variants based on configured profiles and criteria. 7080 7081 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7082 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7083 a table name is provided, the method will prioritize the variants in that specific table 7084 :type table: str 7085 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7086 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7087 provided, the code will use a default prefix value of "PZ" 7088 :type pz_prefix: str 7089 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7090 additional parameters specific to the prioritization process. 
These parameters can include 7091 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7092 configurations needed for the prioritization of variants in a V 7093 :type pz_param: dict 7094 :return: A boolean value (True) is being returned from the `prioritization` function. 7095 """ 7096 7097 # Config 7098 config = self.get_config() 7099 7100 # Param 7101 param = self.get_param() 7102 7103 # Prioritization param 7104 if pz_param is not None: 7105 prioritization_param = pz_param 7106 else: 7107 prioritization_param = param.get("prioritization", {}) 7108 7109 # Configuration profiles 7110 prioritization_config_file = prioritization_param.get( 7111 "prioritization_config", None 7112 ) 7113 prioritization_config_file = full_path(prioritization_config_file) 7114 prioritizations_config = self.get_config_json( 7115 name="prioritizations", config_file=prioritization_config_file 7116 ) 7117 7118 # Prioritization prefix 7119 pz_prefix_default = "PZ" 7120 if pz_prefix is None: 7121 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7122 7123 # Prioritization options 7124 profiles = prioritization_param.get("profiles", []) 7125 if isinstance(profiles, str): 7126 profiles = profiles.split(",") 7127 pzfields = prioritization_param.get( 7128 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7129 ) 7130 if isinstance(pzfields, str): 7131 pzfields = pzfields.split(",") 7132 default_profile = prioritization_param.get("default_profile", None) 7133 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7134 prioritization_score_mode = prioritization_param.get( 7135 "prioritization_score_mode", "HOWARD" 7136 ) 7137 7138 # Quick Prioritizations 7139 prioritizations = param.get("prioritizations", None) 7140 if prioritizations: 7141 log.info("Quick Prioritization:") 7142 for profile in prioritizations.split(","): 7143 if profile not in profiles: 7144 profiles.append(profile) 7145 log.info(f" {profile}") 7146 7147 # If 
profile "ALL" provided, all profiles in the config profiles 7148 if "ALL" in profiles: 7149 profiles = list(prioritizations_config.keys()) 7150 7151 for profile in profiles: 7152 if prioritizations_config.get(profile, None): 7153 log.debug(f"Profile '{profile}' configured") 7154 else: 7155 msg_error = f"Profile '{profile}' NOT configured" 7156 log.error(msg_error) 7157 raise ValueError(msg_error) 7158 7159 if profiles: 7160 log.info(f"Prioritization... ") 7161 else: 7162 log.debug(f"No profile defined") 7163 return False 7164 7165 if not default_profile and len(profiles): 7166 default_profile = profiles[0] 7167 7168 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7169 log.debug("Profiles to check: " + str(list(profiles))) 7170 7171 # Variables 7172 if table is not None: 7173 table_variants = table 7174 else: 7175 table_variants = self.get_table_variants(clause="update") 7176 log.debug(f"Table to prioritize: {table_variants}") 7177 7178 # Added columns 7179 added_columns = [] 7180 7181 # Create list of PZfields 7182 # List of PZFields 7183 list_of_pzfields_original = pzfields + [ 7184 pzfield + pzfields_sep + profile 7185 for pzfield in pzfields 7186 for profile in profiles 7187 ] 7188 list_of_pzfields = [] 7189 log.debug(f"{list_of_pzfields_original}") 7190 7191 # Remove existing PZfields to use if exists 7192 for pzfield in list_of_pzfields_original: 7193 if self.get_header().infos.get(pzfield, None) is None: 7194 list_of_pzfields.append(pzfield) 7195 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7196 else: 7197 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7198 7199 if list_of_pzfields: 7200 7201 # Explode Infos prefix 7202 explode_infos_prefix = self.get_explode_infos_prefix() 7203 7204 # PZfields tags description 7205 PZfields_INFOS = { 7206 f"{pz_prefix}Tags": { 7207 "ID": f"{pz_prefix}Tags", 7208 "Number": ".", 7209 "Type": "String", 7210 "Description": "Variant tags based on annotation 
criteria", 7211 }, 7212 f"{pz_prefix}Score": { 7213 "ID": f"{pz_prefix}Score", 7214 "Number": 1, 7215 "Type": "Integer", 7216 "Description": "Variant score based on annotation criteria", 7217 }, 7218 f"{pz_prefix}Flag": { 7219 "ID": f"{pz_prefix}Flag", 7220 "Number": 1, 7221 "Type": "String", 7222 "Description": "Variant flag based on annotation criteria", 7223 }, 7224 f"{pz_prefix}Comment": { 7225 "ID": f"{pz_prefix}Comment", 7226 "Number": ".", 7227 "Type": "String", 7228 "Description": "Variant comment based on annotation criteria", 7229 }, 7230 f"{pz_prefix}Infos": { 7231 "ID": f"{pz_prefix}Infos", 7232 "Number": ".", 7233 "Type": "String", 7234 "Description": "Variant infos based on annotation criteria", 7235 }, 7236 f"{pz_prefix}Class": { 7237 "ID": f"{pz_prefix}Class", 7238 "Number": ".", 7239 "Type": "String", 7240 "Description": "Variant class based on annotation criteria", 7241 }, 7242 } 7243 7244 # Create INFO fields if not exist 7245 for field in PZfields_INFOS: 7246 field_ID = PZfields_INFOS[field]["ID"] 7247 field_description = PZfields_INFOS[field]["Description"] 7248 if field_ID not in self.get_header().infos and field_ID in pzfields: 7249 field_description = ( 7250 PZfields_INFOS[field]["Description"] 7251 + f", profile {default_profile}" 7252 ) 7253 self.get_header().infos[field_ID] = vcf.parser._Info( 7254 field_ID, 7255 PZfields_INFOS[field]["Number"], 7256 PZfields_INFOS[field]["Type"], 7257 field_description, 7258 "unknown", 7259 "unknown", 7260 code_type_map[PZfields_INFOS[field]["Type"]], 7261 ) 7262 7263 # Create INFO fields if not exist for each profile 7264 for profile in prioritizations_config: 7265 if profile in profiles or profiles == []: 7266 for field in PZfields_INFOS: 7267 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7268 field_description = ( 7269 PZfields_INFOS[field]["Description"] 7270 + f", profile {profile}" 7271 ) 7272 if ( 7273 field_ID not in self.get_header().infos 7274 and field in pzfields 7275 ): 
7276 self.get_header().infos[field_ID] = vcf.parser._Info( 7277 field_ID, 7278 PZfields_INFOS[field]["Number"], 7279 PZfields_INFOS[field]["Type"], 7280 field_description, 7281 "unknown", 7282 "unknown", 7283 code_type_map[PZfields_INFOS[field]["Type"]], 7284 ) 7285 7286 # Header 7287 for pzfield in list_of_pzfields: 7288 if re.match(f"{pz_prefix}Score.*", pzfield): 7289 added_column = self.add_column( 7290 table_name=table_variants, 7291 column_name=pzfield, 7292 column_type="INTEGER", 7293 default_value="0", 7294 ) 7295 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7296 added_column = self.add_column( 7297 table_name=table_variants, 7298 column_name=pzfield, 7299 column_type="BOOLEAN", 7300 default_value="1", 7301 ) 7302 elif re.match(f"{pz_prefix}Class.*", pzfield): 7303 added_column = self.add_column( 7304 table_name=table_variants, 7305 column_name=pzfield, 7306 column_type="VARCHAR[]", 7307 default_value="null", 7308 ) 7309 else: 7310 added_column = self.add_column( 7311 table_name=table_variants, 7312 column_name=pzfield, 7313 column_type="STRING", 7314 default_value="''", 7315 ) 7316 added_columns.append(added_column) 7317 7318 # Profiles 7319 if profiles: 7320 7321 # foreach profile in configuration file 7322 for profile in prioritizations_config: 7323 7324 # If profile is asked in param, or ALL are asked (empty profile []) 7325 if profile in profiles or profiles == []: 7326 log.info(f"Profile '{profile}'") 7327 7328 sql_set_info_option = "" 7329 7330 sql_set_info = [] 7331 7332 # PZ fields set 7333 7334 # PZScore 7335 if ( 7336 f"{pz_prefix}Score{pzfields_sep}{profile}" 7337 in list_of_pzfields 7338 ): 7339 sql_set_info.append( 7340 f""" 7341 concat( 7342 '{pz_prefix}Score{pzfields_sep}{profile}=', 7343 {pz_prefix}Score{pzfields_sep}{profile} 7344 ) 7345 """ 7346 ) 7347 if ( 7348 profile == default_profile 7349 and f"{pz_prefix}Score" in list_of_pzfields 7350 ): 7351 sql_set_info.append( 7352 f""" 7353 concat( 7354 '{pz_prefix}Score=', 7355 
{pz_prefix}Score{pzfields_sep}{profile} 7356 ) 7357 """ 7358 ) 7359 7360 # PZFlag 7361 if ( 7362 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7363 in list_of_pzfields 7364 ): 7365 sql_set_info.append( 7366 f""" 7367 concat( 7368 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7369 CASE 7370 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7371 THEN 'PASS' 7372 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7373 THEN 'FILTERED' 7374 END 7375 ) 7376 """ 7377 ) 7378 if ( 7379 profile == default_profile 7380 and f"{pz_prefix}Flag" in list_of_pzfields 7381 ): 7382 sql_set_info.append( 7383 f""" 7384 concat( 7385 '{pz_prefix}Flag=', 7386 CASE 7387 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7388 THEN 'PASS' 7389 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7390 THEN 'FILTERED' 7391 END 7392 ) 7393 """ 7394 ) 7395 7396 # PZClass 7397 if ( 7398 f"{pz_prefix}Class{pzfields_sep}{profile}" 7399 in list_of_pzfields 7400 ): 7401 sql_set_info.append( 7402 f""" 7403 concat( 7404 '{pz_prefix}Class{pzfields_sep}{profile}=', 7405 CASE 7406 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7407 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7408 ELSE '.' 7409 END 7410 ) 7411 7412 """ 7413 ) 7414 if ( 7415 profile == default_profile 7416 and f"{pz_prefix}Class" in list_of_pzfields 7417 ): 7418 sql_set_info.append( 7419 f""" 7420 concat( 7421 '{pz_prefix}Class=', 7422 CASE 7423 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7424 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7425 ELSE '.' 
7426 END 7427 ) 7428 """ 7429 ) 7430 7431 # PZComment 7432 if ( 7433 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7434 in list_of_pzfields 7435 ): 7436 sql_set_info.append( 7437 f""" 7438 CASE 7439 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7440 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7441 ELSE '' 7442 END 7443 """ 7444 ) 7445 if ( 7446 profile == default_profile 7447 and f"{pz_prefix}Comment" in list_of_pzfields 7448 ): 7449 sql_set_info.append( 7450 f""" 7451 CASE 7452 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7453 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7454 ELSE '' 7455 END 7456 """ 7457 ) 7458 7459 # PZInfos 7460 if ( 7461 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7462 in list_of_pzfields 7463 ): 7464 sql_set_info.append( 7465 f""" 7466 CASE 7467 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7468 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7469 ELSE '' 7470 END 7471 """ 7472 ) 7473 if ( 7474 profile == default_profile 7475 and f"{pz_prefix}Infos" in list_of_pzfields 7476 ): 7477 sql_set_info.append( 7478 f""" 7479 CASE 7480 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7481 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7482 ELSE '' 7483 END 7484 """ 7485 ) 7486 7487 # Merge PZfields 7488 sql_set_info_option = "" 7489 sql_set_sep = "" 7490 for sql_set in sql_set_info: 7491 if sql_set_sep: 7492 sql_set_info_option += f""" 7493 , concat('{sql_set_sep}', {sql_set}) 7494 """ 7495 else: 7496 sql_set_info_option += f""" 7497 , {sql_set} 7498 """ 7499 sql_set_sep = ";" 7500 7501 sql_queries = [] 7502 for annotation in prioritizations_config[profile]: 7503 7504 # skip special sections 7505 if annotation.startswith("_"): 7506 continue 7507 7508 # For each criterions 7509 for criterion in prioritizations_config[profile][ 7510 annotation 
7511 ]: 7512 7513 # Criterion mode 7514 criterion_mode = None 7515 if np.any( 7516 np.isin(list(criterion.keys()), ["type", "value"]) 7517 ): 7518 criterion_mode = "operation" 7519 elif np.any( 7520 np.isin(list(criterion.keys()), ["sql", "fields"]) 7521 ): 7522 criterion_mode = "sql" 7523 log.debug(f"Criterion Mode: {criterion_mode}") 7524 7525 # Criterion parameters 7526 criterion_type = criterion.get("type", None) 7527 criterion_value = criterion.get("value", None) 7528 criterion_sql = criterion.get("sql", None) 7529 criterion_fields = criterion.get("fields", None) 7530 criterion_score = criterion.get("score", 0) 7531 criterion_flag = criterion.get("flag", "PASS") 7532 criterion_class = criterion.get("class", None) 7533 criterion_flag_bool = criterion_flag == "PASS" 7534 criterion_comment = ( 7535 ", ".join(criterion.get("comment", [])) 7536 .replace("'", "''") 7537 .replace(";", ",") 7538 .replace("\t", " ") 7539 ) 7540 criterion_infos = ( 7541 str(criterion) 7542 .replace("'", "''") 7543 .replace(";", ",") 7544 .replace("\t", " ") 7545 ) 7546 7547 # SQL 7548 if criterion_sql is not None and isinstance( 7549 criterion_sql, list 7550 ): 7551 criterion_sql = " ".join(criterion_sql) 7552 7553 # Fields and explode 7554 if criterion_fields is None: 7555 criterion_fields = [annotation] 7556 if not isinstance(criterion_fields, list): 7557 criterion_fields = str(criterion_fields).split(",") 7558 7559 # Class 7560 if criterion_class is not None and not isinstance( 7561 criterion_class, list 7562 ): 7563 criterion_class = str(criterion_class).split(",") 7564 7565 for annotation_field in criterion_fields: 7566 7567 # Explode specific annotation 7568 log.debug( 7569 f"Explode annotation '{annotation_field}'" 7570 ) 7571 added_columns += self.explode_infos( 7572 prefix=explode_infos_prefix, 7573 fields=[annotation_field], 7574 table=table_variants, 7575 ) 7576 extra_infos = self.get_extra_infos( 7577 table=table_variants 7578 ) 7579 7580 # Check if annotation field is 
present 7581 if ( 7582 f"{explode_infos_prefix}{annotation_field}" 7583 not in extra_infos 7584 ): 7585 msq_err = f"Annotation '{annotation_field}' not in data" 7586 log.error(msq_err) 7587 raise ValueError(msq_err) 7588 else: 7589 log.debug( 7590 f"Annotation '{annotation_field}' in data" 7591 ) 7592 7593 sql_set = [] 7594 sql_set_info = [] 7595 7596 # PZ fields set 7597 7598 # PZScore 7599 if ( 7600 f"{pz_prefix}Score{pzfields_sep}{profile}" 7601 in list_of_pzfields 7602 ): 7603 # VaRank prioritization score mode 7604 if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]: 7605 sql_set.append( 7606 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END " 7607 ) 7608 # default HOWARD prioritization score mode 7609 else: 7610 sql_set.append( 7611 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7612 ) 7613 7614 # PZFlag 7615 if ( 7616 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7617 in list_of_pzfields 7618 ): 7619 sql_set.append( 7620 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7621 ) 7622 7623 # PZClass 7624 if ( 7625 f"{pz_prefix}Class{pzfields_sep}{profile}" 7626 in list_of_pzfields 7627 and criterion_class is not None 7628 ): 7629 sql_set.append( 7630 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7631 ) 7632 7633 # PZComment 7634 if ( 7635 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7636 in list_of_pzfields 7637 ): 7638 sql_set.append( 7639 f""" 7640 {pz_prefix}Comment{pzfields_sep}{profile} = 7641 concat( 7642 {pz_prefix}Comment{pzfields_sep}{profile}, 7643 CASE 7644 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7645 THEN ', ' 7646 ELSE '' 7647 END, 7648 '{criterion_comment}' 7649 ) 7650 """ 
7651 ) 7652 7653 # PZInfos 7654 if ( 7655 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7656 in list_of_pzfields 7657 ): 7658 sql_set.append( 7659 f""" 7660 {pz_prefix}Infos{pzfields_sep}{profile} = 7661 concat( 7662 {pz_prefix}Infos{pzfields_sep}{profile}, 7663 '{criterion_infos}' 7664 ) 7665 """ 7666 ) 7667 sql_set_option = ",".join(sql_set) 7668 7669 # Criterion and comparison 7670 if sql_set_option: 7671 7672 if criterion_mode in ["operation"]: 7673 7674 try: 7675 float(criterion_value) 7676 sql_update = f""" 7677 UPDATE {table_variants} 7678 SET {sql_set_option} 7679 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7680 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7681 """ 7682 except: 7683 contains_option = "" 7684 if criterion_type == "contains": 7685 contains_option = ".*" 7686 sql_update = f""" 7687 UPDATE {table_variants} 7688 SET {sql_set_option} 7689 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7690 """ 7691 sql_queries.append(sql_update) 7692 7693 elif criterion_mode in ["sql"]: 7694 7695 sql_update = f""" 7696 UPDATE {table_variants} 7697 SET {sql_set_option} 7698 WHERE {criterion_sql} 7699 """ 7700 sql_queries.append(sql_update) 7701 7702 else: 7703 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7704 log.error(msg_err) 7705 raise ValueError(msg_err) 7706 7707 else: 7708 log.warning( 7709 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7710 ) 7711 7712 # PZTags 7713 if ( 7714 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7715 in list_of_pzfields 7716 ): 7717 7718 # Create PZFalgs value 7719 pztags_value = "" 7720 pztags_sep_default = "," 7721 pztags_sep = "" 7722 for pzfield in pzfields: 7723 if pzfield not in [f"{pz_prefix}Tags"]: 7724 if ( 7725 f"{pzfield}{pzfields_sep}{profile}" 7726 in list_of_pzfields 7727 ): 7728 if pzfield in [f"{pz_prefix}Flag"]: 7729 
pztags_value += f"""{pztags_sep}{pzfield}#', 7730 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7731 THEN 'PASS' 7732 ELSE 'FILTERED' 7733 END, '""" 7734 elif pzfield in [f"{pz_prefix}Class"]: 7735 pztags_value += f"""{pztags_sep}{pzfield}#', 7736 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7737 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7738 ELSE '.' 7739 END, '""" 7740 else: 7741 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7742 pztags_sep = pztags_sep_default 7743 7744 # Add Query update for PZFlags 7745 sql_update_pztags = f""" 7746 UPDATE {table_variants} 7747 SET INFO = concat( 7748 INFO, 7749 CASE WHEN INFO NOT in ('','.') 7750 THEN ';' 7751 ELSE '' 7752 END, 7753 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7754 ) 7755 """ 7756 sql_queries.append(sql_update_pztags) 7757 7758 # Add Query update for PZFlags for default 7759 if profile == default_profile: 7760 sql_update_pztags_default = f""" 7761 UPDATE {table_variants} 7762 SET INFO = concat( 7763 INFO, 7764 ';', 7765 '{pz_prefix}Tags={pztags_value}' 7766 ) 7767 """ 7768 sql_queries.append(sql_update_pztags_default) 7769 7770 log.info(f"""Profile '{profile}' - Prioritization... """) 7771 7772 if sql_queries: 7773 7774 for sql_query in sql_queries: 7775 log.debug( 7776 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7777 ) 7778 self.conn.execute(sql_query) 7779 7780 log.info(f"""Profile '{profile}' - Update... 
""") 7781 sql_query_update = f""" 7782 UPDATE {table_variants} 7783 SET INFO = 7784 concat( 7785 CASE 7786 WHEN INFO NOT IN ('','.') 7787 THEN concat(INFO, ';') 7788 ELSE '' 7789 END 7790 {sql_set_info_option} 7791 ) 7792 """ 7793 self.conn.execute(sql_query_update) 7794 7795 else: 7796 7797 log.warning(f"No profiles in parameters") 7798 7799 # Remove added columns 7800 for added_column in added_columns: 7801 self.drop_column(column=added_column) 7802 7803 # Explode INFOS fields into table fields 7804 if self.get_explode_infos(): 7805 self.explode_infos( 7806 prefix=self.get_explode_infos_prefix(), 7807 fields=self.get_explode_infos_fields(), 7808 force=True, 7809 ) 7810 7811 return True 7812 7813 ### 7814 # HGVS 7815 ### 7816 7817 def annotation_hgvs(self, threads: int = None) -> None: 7818 """ 7819 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7820 coordinates and alleles. 7821 7822 :param threads: The `threads` parameter is an optional integer that specifies the number of 7823 threads to use for parallel processing. If no value is provided, it will default to the number 7824 of threads obtained from the `get_threads()` method 7825 :type threads: int 7826 """ 7827 7828 # Function for each partition of the Dask Dataframe 7829 def partition_function(partition): 7830 """ 7831 The function `partition_function` applies the `annotation_hgvs_partition` function to 7832 each row of a DataFrame called `partition`. 7833 7834 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7835 to be processed 7836 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7837 the "partition" dataframe along the axis 1. 
7838 """ 7839 return partition.apply(annotation_hgvs_partition, axis=1) 7840 7841 def annotation_hgvs_partition(row) -> str: 7842 """ 7843 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7844 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7845 7846 :param row: A dictionary-like object that contains the values for the following keys: 7847 :return: a string that contains the HGVS names associated with the given row of data. 7848 """ 7849 7850 chr = row["CHROM"] 7851 pos = row["POS"] 7852 ref = row["REF"] 7853 alt = row["ALT"] 7854 7855 # Find list of associated transcripts 7856 transcripts_list = list( 7857 polars_conn.execute( 7858 f""" 7859 SELECT transcript 7860 FROM refseq_df 7861 WHERE CHROM='{chr}' 7862 AND POS={pos} 7863 """ 7864 )["transcript"] 7865 ) 7866 7867 # Full HGVS annotation in list 7868 hgvs_full_list = [] 7869 7870 for transcript_name in transcripts_list: 7871 7872 # Transcript 7873 transcript = get_transcript( 7874 transcripts=transcripts, transcript_name=transcript_name 7875 ) 7876 # Exon 7877 if use_exon: 7878 exon = transcript.find_exon_number(pos) 7879 else: 7880 exon = None 7881 # Protein 7882 transcript_protein = None 7883 if use_protein or add_protein or full_format: 7884 transcripts_protein = list( 7885 polars_conn.execute( 7886 f""" 7887 SELECT protein 7888 FROM refseqlink_df 7889 WHERE transcript='{transcript_name}' 7890 LIMIT 1 7891 """ 7892 )["protein"] 7893 ) 7894 if len(transcripts_protein): 7895 transcript_protein = transcripts_protein[0] 7896 7897 # HGVS name 7898 hgvs_name = format_hgvs_name( 7899 chr, 7900 pos, 7901 ref, 7902 alt, 7903 genome=genome, 7904 transcript=transcript, 7905 transcript_protein=transcript_protein, 7906 exon=exon, 7907 use_gene=use_gene, 7908 use_protein=use_protein, 7909 full_format=full_format, 7910 use_version=use_version, 7911 codon_type=codon_type, 7912 ) 7913 hgvs_full_list.append(hgvs_name) 7914 if add_protein and not 
use_protein and not full_format: 7915 hgvs_name = format_hgvs_name( 7916 chr, 7917 pos, 7918 ref, 7919 alt, 7920 genome=genome, 7921 transcript=transcript, 7922 transcript_protein=transcript_protein, 7923 exon=exon, 7924 use_gene=use_gene, 7925 use_protein=True, 7926 full_format=False, 7927 use_version=use_version, 7928 codon_type=codon_type, 7929 ) 7930 hgvs_full_list.append(hgvs_name) 7931 7932 # Create liste of HGVS annotations 7933 hgvs_full = ",".join(hgvs_full_list) 7934 7935 return hgvs_full 7936 7937 # Polars connexion 7938 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7939 7940 # Config 7941 config = self.get_config() 7942 7943 # Databases 7944 # Genome 7945 databases_genomes_folders = ( 7946 config.get("folders", {}) 7947 .get("databases", {}) 7948 .get("genomes", DEFAULT_GENOME_FOLDER) 7949 ) 7950 databases_genome = ( 7951 config.get("folders", {}).get("databases", {}).get("genomes", "") 7952 ) 7953 # refseq database folder 7954 databases_refseq_folders = ( 7955 config.get("folders", {}) 7956 .get("databases", {}) 7957 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7958 ) 7959 # refseq 7960 databases_refseq = config.get("databases", {}).get("refSeq", None) 7961 # refSeqLink 7962 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7963 7964 # Param 7965 param = self.get_param() 7966 7967 # Quick HGVS 7968 if "hgvs_options" in param and param.get("hgvs_options", ""): 7969 log.info(f"Quick HGVS Annotation:") 7970 if not param.get("hgvs", None): 7971 param["hgvs"] = {} 7972 for option in param.get("hgvs_options", "").split(","): 7973 option_var_val = option.split("=") 7974 option_var = option_var_val[0] 7975 if len(option_var_val) > 1: 7976 option_val = option_var_val[1] 7977 else: 7978 option_val = "True" 7979 if option_val.upper() in ["TRUE"]: 7980 option_val = True 7981 elif option_val.upper() in ["FALSE"]: 7982 option_val = False 7983 log.info(f" {option_var}={option_val}") 7984 param["hgvs"][option_var] = option_val 7985 
7986 # Check if HGVS annotation enabled 7987 if "hgvs" in param: 7988 log.info(f"HGVS Annotation... ") 7989 for hgvs_option in param.get("hgvs", {}): 7990 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7991 else: 7992 return 7993 7994 # HGVS Param 7995 param_hgvs = param.get("hgvs", {}) 7996 use_exon = param_hgvs.get("use_exon", False) 7997 use_gene = param_hgvs.get("use_gene", False) 7998 use_protein = param_hgvs.get("use_protein", False) 7999 add_protein = param_hgvs.get("add_protein", False) 8000 full_format = param_hgvs.get("full_format", False) 8001 use_version = param_hgvs.get("use_version", False) 8002 codon_type = param_hgvs.get("codon_type", "3") 8003 8004 # refSseq refSeqLink 8005 databases_refseq = param_hgvs.get("refseq", databases_refseq) 8006 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 8007 8008 # Assembly 8009 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 8010 8011 # Genome 8012 genome_file = None 8013 if find_genome(databases_genome): 8014 genome_file = find_genome(databases_genome) 8015 else: 8016 genome_file = find_genome( 8017 genome_path=databases_genomes_folders, assembly=assembly 8018 ) 8019 log.debug("Genome: " + str(genome_file)) 8020 8021 # refSseq 8022 refseq_file = find_file_prefix( 8023 input_file=databases_refseq, 8024 prefix="ncbiRefSeq", 8025 folder=databases_refseq_folders, 8026 assembly=assembly, 8027 ) 8028 log.debug("refSeq: " + str(refseq_file)) 8029 8030 # refSeqLink 8031 refseqlink_file = find_file_prefix( 8032 input_file=databases_refseqlink, 8033 prefix="ncbiRefSeqLink", 8034 folder=databases_refseq_folders, 8035 assembly=assembly, 8036 ) 8037 log.debug("refSeqLink: " + str(refseqlink_file)) 8038 8039 # Threads 8040 if not threads: 8041 threads = self.get_threads() 8042 log.debug("Threads: " + str(threads)) 8043 8044 # Variables 8045 table_variants = self.get_table_variants(clause="update") 8046 8047 # Get variants SNV and InDel only 8048 
query_variants = f""" 8049 SELECT "#CHROM" AS CHROM, POS, REF, ALT 8050 FROM {table_variants} 8051 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 8052 """ 8053 df_variants = self.get_query_to_df(query_variants) 8054 8055 # Added columns 8056 added_columns = [] 8057 8058 # Add hgvs column in variants table 8059 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 8060 added_column = self.add_column( 8061 table_variants, hgvs_column_name, "STRING", default_value=None 8062 ) 8063 added_columns.append(added_column) 8064 8065 log.debug(f"refSeq loading...") 8066 # refSeq in duckDB 8067 refseq_table = get_refseq_table( 8068 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 8069 ) 8070 # Loading all refSeq in Dataframe 8071 refseq_query = f""" 8072 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8073 FROM {refseq_table} 8074 JOIN df_variants ON ( 8075 {refseq_table}.chrom = df_variants.CHROM 8076 AND {refseq_table}.txStart<=df_variants.POS 8077 AND {refseq_table}.txEnd>=df_variants.POS 8078 ) 8079 """ 8080 refseq_df = self.conn.query(refseq_query).pl() 8081 8082 if refseqlink_file: 8083 log.debug(f"refSeqLink loading...") 8084 # refSeqLink in duckDB 8085 refseqlink_table = get_refseq_table( 8086 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8087 ) 8088 # Loading all refSeqLink in Dataframe 8089 protacc_column = "protAcc_with_ver" 8090 mrnaacc_column = "mrnaAcc_with_ver" 8091 refseqlink_query = f""" 8092 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8093 FROM {refseqlink_table} 8094 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8095 WHERE protAcc_without_ver IS NOT NULL 8096 """ 8097 # Polars Dataframe 8098 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8099 8100 # Read RefSeq transcripts into a python dict/model. 
8101 log.debug(f"Transcripts loading...") 8102 with tempfile.TemporaryDirectory() as tmpdir: 8103 transcripts_query = f""" 8104 COPY ( 8105 SELECT {refseq_table}.* 8106 FROM {refseq_table} 8107 JOIN df_variants ON ( 8108 {refseq_table}.chrom=df_variants.CHROM 8109 AND {refseq_table}.txStart<=df_variants.POS 8110 AND {refseq_table}.txEnd>=df_variants.POS 8111 ) 8112 ) 8113 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8114 """ 8115 self.conn.query(transcripts_query) 8116 with open(f"{tmpdir}/transcript.tsv") as infile: 8117 transcripts = read_transcripts(infile) 8118 8119 # Polars connexion 8120 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8121 8122 log.debug("Genome loading...") 8123 # Read genome sequence using pyfaidx. 8124 genome = Fasta(genome_file) 8125 8126 log.debug("Start annotation HGVS...") 8127 8128 # Create 8129 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8130 ddf = dd.from_pandas(df_variants, npartitions=threads) 8131 8132 # Use dask.dataframe.apply() to apply function on each partition 8133 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8134 8135 # Convert Dask DataFrame to Pandas Dataframe 8136 df = ddf.compute() 8137 8138 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
8139 with tempfile.TemporaryDirectory() as tmpdir: 8140 df_parquet = os.path.join(tmpdir, "df.parquet") 8141 df.to_parquet(df_parquet) 8142 8143 # Update hgvs column 8144 update_variant_query = f""" 8145 UPDATE {table_variants} 8146 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8147 FROM read_parquet('{df_parquet}') as df 8148 WHERE variants."#CHROM" = df.CHROM 8149 AND variants.POS = df.POS 8150 AND variants.REF = df.REF 8151 AND variants.ALT = df.ALT 8152 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8153 """ 8154 self.execute_query(update_variant_query) 8155 8156 # Update INFO column 8157 sql_query_update = f""" 8158 UPDATE {table_variants} 8159 SET INFO = 8160 concat( 8161 CASE 8162 WHEN INFO NOT IN ('','.') 8163 THEN concat(INFO, ';') 8164 ELSE '' 8165 END, 8166 'hgvs=', 8167 {hgvs_column_name} 8168 ) 8169 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8170 """ 8171 self.execute_query(sql_query_update) 8172 8173 # Add header 8174 HGVS_INFOS = { 8175 "hgvs": { 8176 "ID": "hgvs", 8177 "Number": ".", 8178 "Type": "String", 8179 "Description": f"HGVS annotatation with HOWARD", 8180 } 8181 } 8182 8183 for field in HGVS_INFOS: 8184 field_ID = HGVS_INFOS[field]["ID"] 8185 field_description = HGVS_INFOS[field]["Description"] 8186 self.get_header().infos[field_ID] = vcf.parser._Info( 8187 field_ID, 8188 HGVS_INFOS[field]["Number"], 8189 HGVS_INFOS[field]["Type"], 8190 field_description, 8191 "unknown", 8192 "unknown", 8193 code_type_map[HGVS_INFOS[field]["Type"]], 8194 ) 8195 8196 # Remove added columns 8197 for added_column in added_columns: 8198 self.drop_column(column=added_column) 8199 8200 ### 8201 # Calculation 8202 ### 8203 8204 def get_operations_help( 8205 self, operations_config_dict: dict = {}, operations_config_file: str = None 8206 ) -> list: 8207 8208 # Init 8209 operations_help = [] 8210 8211 # operations 8212 operations = self.get_config_json( 8213 name="calculations", 8214 
config_dict=operations_config_dict, 8215 config_file=operations_config_file, 8216 ) 8217 for op in operations: 8218 op_name = operations[op].get("name", op).upper() 8219 op_description = operations[op].get("description", op_name) 8220 op_available = operations[op].get("available", False) 8221 if op_available: 8222 operations_help.append(f" {op_name}: {op_description}") 8223 8224 # Sort operations 8225 operations_help.sort() 8226 8227 # insert header 8228 operations_help.insert(0, "Available calculation operations:") 8229 8230 # Return 8231 return operations_help 8232 8233 def calculation( 8234 self, 8235 operations: dict = {}, 8236 operations_config_dict: dict = {}, 8237 operations_config_file: str = None, 8238 ) -> None: 8239 """ 8240 It takes a list of operations, and for each operation, it checks if it's a python or sql 8241 operation, and then calls the appropriate function 8242 8243 param json example: 8244 "calculation": { 8245 "NOMEN": { 8246 "options": { 8247 "hgvs_field": "hgvs" 8248 }, 8249 "middle" : null 8250 } 8251 """ 8252 8253 # Param 8254 param = self.get_param() 8255 8256 # CHeck operations config file 8257 if operations_config_file is None: 8258 operations_config_file = param.get("calculation", {}).get( 8259 "calculation_config", None 8260 ) 8261 8262 # operations config 8263 operations_config = self.get_config_json( 8264 name="calculations", 8265 config_dict=operations_config_dict, 8266 config_file=operations_config_file, 8267 ) 8268 8269 # Upper keys 8270 operations_config = {k.upper(): v for k, v in operations_config.items()} 8271 8272 # Calculations 8273 8274 # Operations from param 8275 operations = param.get("calculation", {}).get("calculations", operations) 8276 8277 # Quick calculation - add 8278 if param.get("calculations", None): 8279 8280 # List of operations 8281 calculations_list = [ 8282 value.strip() for value in param.get("calculations", "").split(",") 8283 ] 8284 8285 # Log 8286 log.info(f"Quick Calculations:") 8287 for 
calculation_key in calculations_list: 8288 log.info(f" {calculation_key}") 8289 8290 # Create tmp operations (to keep operation order) 8291 operations_tmp = {} 8292 for calculation_operation in calculations_list: 8293 if calculation_operation.upper() not in operations_tmp: 8294 log.debug( 8295 f"{calculation_operation}.upper() not in {operations_tmp}" 8296 ) 8297 operations_tmp[calculation_operation.upper()] = {} 8298 add_value_into_dict( 8299 dict_tree=operations_tmp, 8300 sections=[ 8301 calculation_operation.upper(), 8302 ], 8303 value=operations.get(calculation_operation.upper(), {}), 8304 ) 8305 # Add operations already in param 8306 for calculation_operation in operations: 8307 if calculation_operation not in operations_tmp: 8308 operations_tmp[calculation_operation] = operations.get( 8309 calculation_operation, {} 8310 ) 8311 8312 # Update operations in param 8313 operations = operations_tmp 8314 8315 # Operations for calculation 8316 if not operations: 8317 operations = param.get("calculation", {}).get("calculations", {}) 8318 8319 if operations: 8320 log.info(f"Calculations...") 8321 8322 # For each operations 8323 for operation_name in operations: 8324 operation_name = operation_name.upper() 8325 if operation_name not in [""]: 8326 if operation_name in operations_config: 8327 log.info(f"Calculation '{operation_name}'") 8328 operation = operations_config[operation_name] 8329 operation_type = operation.get("type", "sql") 8330 if operation_type == "python": 8331 self.calculation_process_function( 8332 operation=operation, operation_name=operation_name 8333 ) 8334 elif operation_type == "sql": 8335 self.calculation_process_sql( 8336 operation=operation, operation_name=operation_name 8337 ) 8338 else: 8339 log.error( 8340 f"Operations config: Type '{operation_type}' NOT available" 8341 ) 8342 raise ValueError( 8343 f"Operations config: Type '{operation_type}' NOT available" 8344 ) 8345 else: 8346 log.error( 8347 f"Operations config: Calculation 
'{operation_name}' NOT available" 8348 ) 8349 raise ValueError( 8350 f"Operations config: Calculation '{operation_name}' NOT available" 8351 ) 8352 8353 # Explode INFOS fields into table fields 8354 if self.get_explode_infos(): 8355 self.explode_infos( 8356 prefix=self.get_explode_infos_prefix(), 8357 fields=self.get_explode_infos_fields(), 8358 force=True, 8359 ) 8360 8361 def calculation_process_sql( 8362 self, operation: dict, operation_name: str = "unknown" 8363 ) -> None: 8364 """ 8365 The `calculation_process_sql` function takes in a mathematical operation as a string and 8366 performs the operation, updating the specified table with the result. 8367 8368 :param operation: The `operation` parameter is a dictionary that contains information about the 8369 mathematical operation to be performed. It includes the following keys: 8370 :type operation: dict 8371 :param operation_name: The `operation_name` parameter is a string that represents the name of 8372 the mathematical operation being performed. 
It is used for logging and error handling purposes, 8373 defaults to unknown 8374 :type operation_name: str (optional) 8375 """ 8376 8377 # Operation infos 8378 operation_name = operation.get("name", "unknown") 8379 log.debug(f"process SQL {operation_name}") 8380 output_column_name = operation.get("output_column_name", operation_name) 8381 output_column_type = operation.get("output_column_type", "String") 8382 prefix = operation.get("explode_infos_prefix", "") 8383 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8384 output_column_description = operation.get( 8385 "output_column_description", f"{operation_name} operation" 8386 ) 8387 operation_query = operation.get("operation_query", None) 8388 if isinstance(operation_query, list): 8389 operation_query = " ".join(operation_query) 8390 operation_info_fields = operation.get("info_fields", []) 8391 operation_info_fields_check = operation.get("info_fields_check", False) 8392 operation_info = operation.get("operation_info", True) 8393 operation_table = operation.get( 8394 "table", self.get_table_variants(clause="alter") 8395 ) 8396 8397 # table variants 8398 if operation_table: 8399 table_variants = operation_table 8400 else: 8401 table_variants = self.get_table_variants(clause="alter") 8402 8403 if operation_query: 8404 8405 # Info fields check 8406 operation_info_fields_check_result = True 8407 if operation_info_fields_check: 8408 header_infos = self.get_header().infos 8409 for info_field in operation_info_fields: 8410 operation_info_fields_check_result = ( 8411 operation_info_fields_check_result 8412 and info_field in header_infos 8413 ) 8414 8415 # If info fields available 8416 if operation_info_fields_check_result: 8417 8418 # Added_columns 8419 added_columns = [] 8420 8421 # Create VCF header field 8422 vcf_reader = self.get_header() 8423 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8424 output_column_name, 8425 ".", 8426 output_column_type, 8427 
output_column_description, 8428 "howard calculation", 8429 "0", 8430 self.code_type_map.get(output_column_type), 8431 ) 8432 8433 # Explode infos if needed 8434 log.debug(f"calculation_process_sql prefix {prefix}") 8435 added_columns += self.explode_infos( 8436 prefix=prefix, 8437 fields=[output_column_name] + operation_info_fields, 8438 force=False, 8439 table=table_variants, 8440 ) 8441 8442 # Create column 8443 added_column = self.add_column( 8444 table_name=table_variants, 8445 column_name=prefix + output_column_name, 8446 column_type=output_column_type_sql, 8447 default_value="null", 8448 ) 8449 added_columns.append(added_column) 8450 8451 # Operation calculation 8452 try: 8453 8454 # Query to update calculation column 8455 sql_update = f""" 8456 UPDATE {table_variants} 8457 SET "{prefix}{output_column_name}" = ({operation_query}) 8458 """ 8459 self.conn.execute(sql_update) 8460 8461 # Add to INFO 8462 if operation_info: 8463 sql_update_info = f""" 8464 UPDATE {table_variants} 8465 SET "INFO" = 8466 concat( 8467 CASE 8468 WHEN "INFO" IS NOT NULL 8469 THEN concat("INFO", ';') 8470 ELSE '' 8471 END, 8472 '{output_column_name}=', 8473 "{prefix}{output_column_name}" 8474 ) 8475 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8476 """ 8477 self.conn.execute(sql_update_info) 8478 8479 except: 8480 log.error( 8481 f"Operations config: Calculation '{operation_name}' query failed" 8482 ) 8483 raise ValueError( 8484 f"Operations config: Calculation '{operation_name}' query failed" 8485 ) 8486 8487 # Remove added columns 8488 for added_column in added_columns: 8489 log.debug(f"added_column: {added_column}") 8490 self.drop_column(column=added_column) 8491 8492 else: 8493 log.error( 8494 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8495 ) 8496 raise ValueError( 8497 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields 
{operation_info_fields}" 8498 ) 8499 8500 else: 8501 log.error( 8502 f"Operations config: Calculation '{operation_name}' query NOT defined" 8503 ) 8504 raise ValueError( 8505 f"Operations config: Calculation '{operation_name}' query NOT defined" 8506 ) 8507 8508 def calculation_process_function( 8509 self, operation: dict, operation_name: str = "unknown" 8510 ) -> None: 8511 """ 8512 The `calculation_process_function` takes in an operation dictionary and performs the specified 8513 function with the given parameters. 8514 8515 :param operation: The `operation` parameter is a dictionary that contains information about the 8516 operation to be performed. It has the following keys: 8517 :type operation: dict 8518 :param operation_name: The `operation_name` parameter is a string that represents the name of 8519 the operation being performed. It is used for logging purposes, defaults to unknown 8520 :type operation_name: str (optional) 8521 """ 8522 8523 operation_name = operation["name"] 8524 log.debug(f"process Python {operation_name}") 8525 function_name = operation["function_name"] 8526 function_params = operation["function_params"] 8527 getattr(self, function_name)(*function_params) 8528 8529 def calculation_variant_id(self) -> None: 8530 """ 8531 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8532 updates the INFO field of a variants table with the variant ID. 
8533 """ 8534 8535 # variant_id annotation field 8536 variant_id_tag = self.get_variant_id_column() 8537 added_columns = [variant_id_tag] 8538 8539 # variant_id hgvs tags" 8540 vcf_infos_tags = { 8541 variant_id_tag: "howard variant ID annotation", 8542 } 8543 8544 # Variants table 8545 table_variants = self.get_table_variants() 8546 8547 # Header 8548 vcf_reader = self.get_header() 8549 8550 # Add variant_id to header 8551 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8552 variant_id_tag, 8553 ".", 8554 "String", 8555 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8556 "howard calculation", 8557 "0", 8558 self.code_type_map.get("String"), 8559 ) 8560 8561 # Update 8562 sql_update = f""" 8563 UPDATE {table_variants} 8564 SET "INFO" = 8565 concat( 8566 CASE 8567 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8568 THEN '' 8569 ELSE concat("INFO", ';') 8570 END, 8571 '{variant_id_tag}=', 8572 "{variant_id_tag}" 8573 ) 8574 """ 8575 self.conn.execute(sql_update) 8576 8577 # Remove added columns 8578 for added_column in added_columns: 8579 self.drop_column(column=added_column) 8580 8581 def calculation_extract_snpeff_hgvs( 8582 self, 8583 snpeff_hgvs: str = "snpeff_hgvs", 8584 snpeff_field: str = "ANN", 8585 ) -> None: 8586 """ 8587 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8588 annotation field in a VCF file and adds them as a new column in the variants table. 8589 8590 :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` 8591 function is used to specify the name of the column that will store the HGVS nomenclatures 8592 extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to 8593 snpeff_hgvs 8594 :type snpeff_hgvs: str (optional) 8595 :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` 8596 function represents the field in the VCF file that contains SnpEff annotations. 
This field is 8597 used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults 8598 to ANN 8599 :type snpeff_field: str (optional) 8600 """ 8601 8602 # Snpeff hgvs tags 8603 vcf_infos_tags = { 8604 snpeff_hgvs: "HGVS nomenclatures from snpEff annotation", 8605 } 8606 8607 # Prefix 8608 prefix = self.get_explode_infos_prefix() 8609 if prefix: 8610 prefix = "INFO/" 8611 8612 # snpEff fields 8613 speff_ann_infos = prefix + snpeff_field 8614 speff_hgvs_infos = prefix + snpeff_hgvs 8615 8616 # Variants table 8617 table_variants = self.get_table_variants() 8618 8619 # Header 8620 vcf_reader = self.get_header() 8621 8622 # Add columns 8623 added_columns = [] 8624 8625 # Explode HGVS field in column 8626 added_columns += self.explode_infos(fields=[snpeff_field]) 8627 8628 if snpeff_field in vcf_reader.infos: 8629 8630 log.debug(vcf_reader.infos[snpeff_field]) 8631 8632 # Extract ANN header 8633 ann_description = vcf_reader.infos[snpeff_field].desc 8634 pattern = r"'(.+?)'" 8635 match = re.search(pattern, ann_description) 8636 if match: 8637 ann_header_match = match.group(1).split(" | ") 8638 ann_header_desc = {} 8639 for i in range(len(ann_header_match)): 8640 ann_header_info = "".join( 8641 char for char in ann_header_match[i] if char.isalnum() 8642 ) 8643 ann_header_desc[ann_header_info] = ann_header_match[i] 8644 if not ann_header_desc: 8645 raise ValueError("Invalid header description format") 8646 else: 8647 raise ValueError("Invalid header description format") 8648 8649 # Create variant id 8650 variant_id_column = self.get_variant_id_column() 8651 added_columns += [variant_id_column] 8652 8653 # Create dataframe 8654 dataframe_snpeff_hgvs = self.get_query_to_df( 8655 f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """ 8656 ) 8657 8658 # Create main NOMEN column 8659 dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[ 8660 speff_ann_infos 8661 ].apply( 8662 lambda x: extract_snpeff_hgvs( 
8663 str(x), header=list(ann_header_desc.values()) 8664 ) 8665 ) 8666 8667 # Add snpeff_hgvs to header 8668 vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info( 8669 snpeff_hgvs, 8670 ".", 8671 "String", 8672 vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"), 8673 "howard calculation", 8674 "0", 8675 self.code_type_map.get("String"), 8676 ) 8677 8678 # Update 8679 sql_update = f""" 8680 UPDATE variants 8681 SET "INFO" = 8682 concat( 8683 CASE 8684 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8685 THEN '' 8686 ELSE concat("INFO", ';') 8687 END, 8688 CASE 8689 WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN') 8690 AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL 8691 THEN concat( 8692 '{snpeff_hgvs}=', 8693 dataframe_snpeff_hgvs."{speff_hgvs_infos}" 8694 ) 8695 ELSE '' 8696 END 8697 ) 8698 FROM dataframe_snpeff_hgvs 8699 WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}" 8700 8701 """ 8702 self.conn.execute(sql_update) 8703 8704 # Delete dataframe 8705 del dataframe_snpeff_hgvs 8706 gc.collect() 8707 8708 else: 8709 8710 log.warning( 8711 "No snpEff annotation. Please Anotate with snpEff before use this calculation option" 8712 ) 8713 8714 # Remove added columns 8715 for added_column in added_columns: 8716 self.drop_column(column=added_column) 8717 8718 def calculation_snpeff_ann_explode( 8719 self, 8720 uniquify: bool = True, 8721 output_format: str = "fields", 8722 output_prefix: str = "snpeff_", 8723 snpeff_field: str = "ANN", 8724 ) -> None: 8725 """ 8726 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8727 exploding the HGVS field and updating variant information accordingly. 8728 8729 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8730 boolean flag that determines whether the output should be uniquified or not. 
        When set to `True`,
        it indicates that the output should be unique, meaning that duplicate entries should be removed,
        defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
        function specifies the format in which the output annotations will be generated. It has a
        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
        format, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
        method is used to specify the prefix that will be added to the output annotations generated
        during the calculation process. This prefix helps to differentiate the newly added annotations
        from existing ones in the output data. By default, the, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
        function is used to specify the field in the VCF file that contains SnpEff annotations. This
        field will be processed to explode the HGVS annotations and update the variant information
        accordingly, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the ANN header description cannot be parsed
        """

        # SnpEff annotation field (internal working column name)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty configured prefix is overwritten with
        # "INFO/" (same pattern as calculation_extract_snpeff_hgvs) — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: sub-field names are quoted in the INFO description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric chars to build a safe identifier
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: JSON output gets a single INFO tag, "fields" output gets
            # one INFO tag per ANN sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update
            # NOTE: SQL references the local DataFrame by variable name —
            # presumably resolved by DuckDB's replacement scan; verify
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
        """

        # NOMEN field
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
for CNOMEN and PNOMEN)", 8911 "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)", 8912 "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)", 8913 } 8914 8915 # Param 8916 param = self.get_param() 8917 8918 # Threads 8919 threads = self.get_threads() 8920 8921 # Prefix 8922 prefix = self.get_explode_infos_prefix() 8923 8924 # Header 8925 vcf_reader = self.get_header() 8926 8927 # Added columns 8928 added_columns = [] 8929 8930 # Get HGVS field 8931 hgvs_field = ( 8932 param.get("calculation", {}) 8933 .get("calculations", {}) 8934 .get("NOMEN", {}) 8935 .get("options", {}) 8936 .get("hgvs_field", "hgvs") 8937 ) 8938 8939 # Get NOMEN pattern 8940 nomen_pattern = ( 8941 param.get("calculation", {}) 8942 .get("calculations", {}) 8943 .get("NOMEN", {}) 8944 .get("options", {}) 8945 .get("pattern", None) 8946 ) 8947 8948 # transcripts list of preference sources 8949 transcripts_sources = {} 8950 8951 # Get transcripts 8952 transcripts_file = ( 8953 param.get("calculation", {}) 8954 .get("calculations", {}) 8955 .get("NOMEN", {}) 8956 .get("options", {}) 8957 .get("transcripts", None) 8958 ) 8959 transcripts_file = full_path(transcripts_file) 8960 if transcripts_file: 8961 if os.path.exists(transcripts_file): 8962 transcripts_dataframe = transcripts_file_to_df(transcripts_file) 8963 transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist() 8964 transcripts_sources["file"] = transcripts_from_file 8965 else: 8966 msg_err = f"Transcript file '{transcripts_file}' does NOT exist" 8967 log.error(msg_err) 8968 raise ValueError(msg_err) 8969 8970 # Get transcripts table 8971 transcripts_table = ( 8972 param.get("calculation", {}) 8973 .get("calculations", {}) 8974 .get("NOMEN", {}) 8975 .get("options", {}) 8976 .get("transcripts_table", self.get_table_variants()) 8977 ) 8978 # Get transcripts column 8979 transcripts_column = ( 8980 param.get("calculation", {}) 8981 .get("calculations", {}) 8982 .get("NOMEN", {}) 8983 
.get("options", {}) 8984 .get("transcripts_column", None) 8985 ) 8986 8987 if transcripts_table and transcripts_column: 8988 extra_field_transcript = f"{transcripts_table}.{transcripts_column}" 8989 # Explode if not exists 8990 added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table) 8991 else: 8992 extra_field_transcript = f"NULL" 8993 8994 # Transcripts of preference source order 8995 transcripts_order = ( 8996 param.get("calculation", {}) 8997 .get("calculations", {}) 8998 .get("NOMEN", {}) 8999 .get("options", {}) 9000 .get("transcripts_order", ["column", "file"]) 9001 ) 9002 9003 # Transcripts from file 9004 transcripts = transcripts_sources.get("file", []) 9005 9006 # Explode HGVS field in column 9007 added_columns += self.explode_infos(fields=[hgvs_field]) 9008 9009 # extra infos 9010 extra_infos = self.get_extra_infos() 9011 extra_field = prefix + hgvs_field 9012 9013 if extra_field in extra_infos: 9014 9015 # Create dataframe 9016 dataframe_hgvs = self.get_query_to_df( 9017 f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """ 9018 ) 9019 9020 # Transcripts rank 9021 transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)} 9022 transcripts_len = len(transcripts_rank) 9023 9024 # Create main NOMEN column 9025 dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply( 9026 lambda x: find_nomen( 9027 hgvs=x.hgvs, 9028 transcript=x.transcript, 9029 transcripts=transcripts_rank, 9030 pattern=nomen_pattern, 9031 transcripts_source_order=transcripts_order, 9032 transcripts_len=transcripts_len 9033 ), 9034 axis=1, 9035 ) 9036 9037 # Explode NOMEN Structure and create SQL set for update 9038 sql_nomen_fields = [] 9039 for nomen_field in nomen_dict: 9040 9041 # Create VCF header field 9042 vcf_reader.infos[nomen_field] = vcf.parser._Info( 9043 nomen_field, 9044 ".", 9045 "String", 9046 nomen_dict.get(nomen_field, "howard 
calculation NOMEN"), 9047 "howard calculation", 9048 "0", 9049 self.code_type_map.get("String"), 9050 ) 9051 9052 # Add field to SQL query update 9053 sql_nomen_fields.append( 9054 f""" 9055 CASE 9056 WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('') 9057 THEN concat( 9058 ';{nomen_field}=', 9059 dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" 9060 ) 9061 ELSE '' 9062 END 9063 """ 9064 ) 9065 9066 # SQL set for update 9067 sql_nomen_fields_set = ", ".join(sql_nomen_fields) 9068 9069 # Update 9070 sql_update = f""" 9071 UPDATE variants 9072 SET "INFO" = 9073 concat( 9074 CASE 9075 WHEN "INFO" IS NULL 9076 THEN '' 9077 ELSE "INFO" 9078 END, 9079 {sql_nomen_fields_set} 9080 ) 9081 FROM dataframe_hgvs 9082 WHERE variants."#CHROM" = dataframe_hgvs."#CHROM" 9083 AND variants."POS" = dataframe_hgvs."POS" 9084 AND variants."REF" = dataframe_hgvs."REF" 9085 AND variants."ALT" = dataframe_hgvs."ALT" 9086 """ 9087 self.conn.execute(sql_update) 9088 9089 # Delete dataframe 9090 del dataframe_hgvs 9091 gc.collect() 9092 9093 # Remove added columns 9094 for added_column in added_columns: 9095 self.drop_column(column=added_column) 9096 9097 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9098 """ 9099 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9100 pipeline/sample for a variant and updates the variant information in a VCF file. 9101 9102 :param tag: The `tag` parameter is a string that represents the annotation field for the 9103 "findbypipeline" information in the VCF file. 
        It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples: calculation only makes sense with genotypes
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (row-wise over sample genotypes)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            # NOTE: SQL references the local DataFrame by variable name —
            # presumably resolved by DuckDB's replacement scan; verify
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_findbypipeline
            gc.collect()

    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` calculates the genotype concordance for
        multi-caller VCF files and updates the variant information in the database.
        """

        # if FORMAT and samples: calculation only makes sense with genotypes
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column (row-wise over sample genotypes)
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                # NOTE(review): fallback text "snpEff hgvs annotations" looks
                # copy-pasted; it is unreachable since the key is always present
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            # NOTE: SQL references the local DataFrame by variable name —
            # presumably resolved by DuckDB's replacement scan; verify
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_genotypeconcordance
            gc.collect()

    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
        the default tag name is set to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # if FORMAT and samples: calculation only makes sense with genotypes
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against tag explicitly passed as empty)
            if not tag:
                tag = "barcode"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (temporary column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise over sample genotypes)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                # NOTE(review): fallback `vcf_infos_tags.get(tag)` is redundant
                # (same lookup as the first argument)
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update
            # NOTE: SQL references the local DataFrame by variable name —
            # presumably resolved by DuckDB's replacement scan; verify
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process.
If no value 9411 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9412 :type tag: str (optional) 9413 """ 9414 9415 # if FORMAT and samples 9416 if ( 9417 "FORMAT" in self.get_header_columns_as_list() 9418 and self.get_header_sample_list() 9419 ): 9420 9421 # barcode annotation field 9422 if not tag: 9423 tag = "BCF" 9424 9425 # VCF infos tags 9426 vcf_infos_tags = { 9427 tag: "barcode family calculation", 9428 f"{tag}S": "barcode family samples", 9429 } 9430 9431 # Param 9432 param = self.get_param() 9433 log.debug(f"param={param}") 9434 9435 # Prefix 9436 prefix = self.get_explode_infos_prefix() 9437 9438 # PED param 9439 ped = ( 9440 param.get("calculation", {}) 9441 .get("calculations", {}) 9442 .get("BARCODEFAMILY", {}) 9443 .get("family_pedigree", None) 9444 ) 9445 log.debug(f"ped={ped}") 9446 9447 # Load PED 9448 if ped: 9449 9450 # Pedigree is a file 9451 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9452 log.debug("Pedigree is file") 9453 with open(full_path(ped)) as ped: 9454 ped = yaml.safe_load(ped) 9455 9456 # Pedigree is a string 9457 elif isinstance(ped, str): 9458 log.debug("Pedigree is str") 9459 try: 9460 ped = json.loads(ped) 9461 log.debug("Pedigree is json str") 9462 except ValueError as e: 9463 ped_samples = ped.split(",") 9464 ped = {} 9465 for ped_sample in ped_samples: 9466 ped[ped_sample] = ped_sample 9467 9468 # Pedigree is a dict 9469 elif isinstance(ped, dict): 9470 log.debug("Pedigree is dict") 9471 9472 # Pedigree is not well formatted 9473 else: 9474 msg_error = "Pedigree not well formatted" 9475 log.error(msg_error) 9476 raise ValueError(msg_error) 9477 9478 # Construct list 9479 ped_samples = list(ped.values()) 9480 9481 else: 9482 log.debug("Pedigree not defined. 
Take all samples") 9483 ped_samples = self.get_header_sample_list() 9484 ped = {} 9485 for ped_sample in ped_samples: 9486 ped[ped_sample] = ped_sample 9487 9488 # Check pedigree 9489 if not ped or len(ped) == 0: 9490 msg_error = f"Error in pedigree: samples {ped_samples}" 9491 log.error(msg_error) 9492 raise ValueError(msg_error) 9493 9494 # Log 9495 log.info( 9496 "Calculation 'BARCODEFAMILY' - Samples: " 9497 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9498 ) 9499 log.debug(f"ped_samples={ped_samples}") 9500 9501 # Field 9502 barcode_infos = prefix + tag 9503 9504 # Variants table 9505 table_variants = self.get_table_variants() 9506 9507 # Header 9508 vcf_reader = self.get_header() 9509 9510 # Create variant id 9511 variant_id_column = self.get_variant_id_column() 9512 added_columns = [variant_id_column] 9513 9514 # variant_id, FORMAT and samples 9515 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9516 [f""" "{sample}" """ for sample in ped_samples] 9517 ) 9518 9519 # Create dataframe 9520 dataframe_barcode = self.get_query_to_df( 9521 f""" SELECT {samples_fields} FROM {table_variants} """ 9522 ) 9523 9524 # Create barcode column 9525 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9526 lambda row: barcode(row, samples=ped_samples), axis=1 9527 ) 9528 9529 # Add barcode family to header 9530 # Add vaf_normalization to header 9531 vcf_reader.formats[tag] = vcf.parser._Format( 9532 id=tag, 9533 num=".", 9534 type="String", 9535 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9536 type_code=self.code_type_map.get("String"), 9537 ) 9538 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9539 id=f"{tag}S", 9540 num=".", 9541 type="String", 9542 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 9543 type_code=self.code_type_map.get("String"), 9544 ) 9545 9546 # Update 9547 # for sample in ped_samples: 9548 sql_update_set = [] 9549 for sample in self.get_header_sample_list() + ["FORMAT"]: 9550 if 
sample in ped_samples: 9551 value = f'dataframe_barcode."{barcode_infos}"' 9552 value_samples = "'" + ",".join([f""" "{sample}" """ for sample in ped_samples]) + "'" 9553 ped_samples 9554 elif sample == "FORMAT": 9555 value = f"'{tag}'" 9556 value_samples = f"'{tag}S'" 9557 else: 9558 value = "'.'" 9559 value_samples = "'.'" 9560 format_regex = r"[a-zA-Z0-9\s]" 9561 sql_update_set.append( 9562 f""" 9563 "{sample}" = 9564 concat( 9565 CASE 9566 WHEN {table_variants}."{sample}" = './.' 9567 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9568 ELSE {table_variants}."{sample}" 9569 END, 9570 ':', 9571 {value}, 9572 ':', 9573 {value_samples} 9574 ) 9575 """ 9576 ) 9577 9578 sql_update_set_join = ", ".join(sql_update_set) 9579 sql_update = f""" 9580 UPDATE {table_variants} 9581 SET {sql_update_set_join} 9582 FROM dataframe_barcode 9583 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9584 """ 9585 self.conn.execute(sql_update) 9586 9587 # Remove added columns 9588 for added_column in added_columns: 9589 self.drop_column(column=added_column) 9590 9591 # Delete dataframe 9592 del dataframe_barcode 9593 gc.collect() 9594 9595 def calculation_trio(self) -> None: 9596 """ 9597 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9598 information to the INFO field of each variant. 
9599 """ 9600 9601 # if FORMAT and samples 9602 if ( 9603 "FORMAT" in self.get_header_columns_as_list() 9604 and self.get_header_sample_list() 9605 ): 9606 9607 # trio annotation field 9608 trio_tag = "trio" 9609 9610 # VCF infos tags 9611 vcf_infos_tags = { 9612 "trio": "trio calculation", 9613 } 9614 9615 # Param 9616 param = self.get_param() 9617 9618 # Prefix 9619 prefix = self.get_explode_infos_prefix() 9620 9621 # Trio param 9622 trio_ped = ( 9623 param.get("calculation", {}) 9624 .get("calculations", {}) 9625 .get("TRIO", {}) 9626 .get("trio_pedigree", None) 9627 ) 9628 9629 # Load trio 9630 if trio_ped: 9631 9632 # Trio pedigree is a file 9633 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9634 log.debug("TRIO pedigree is file") 9635 with open(full_path(trio_ped)) as trio_ped: 9636 trio_ped = yaml.safe_load(trio_ped) 9637 9638 # Trio pedigree is a string 9639 elif isinstance(trio_ped, str): 9640 log.debug("TRIO pedigree is str") 9641 try: 9642 trio_ped = json.loads(trio_ped) 9643 log.debug("TRIO pedigree is json str") 9644 except ValueError as e: 9645 trio_samples = trio_ped.split(",") 9646 if len(trio_samples) == 3: 9647 trio_ped = { 9648 "father": trio_samples[0], 9649 "mother": trio_samples[1], 9650 "child": trio_samples[2], 9651 } 9652 log.debug("TRIO pedigree is list str") 9653 else: 9654 msg_error = "TRIO pedigree not well formatted" 9655 log.error(msg_error) 9656 raise ValueError(msg_error) 9657 9658 # Trio pedigree is a dict 9659 elif isinstance(trio_ped, dict): 9660 log.debug("TRIO pedigree is dict") 9661 9662 # Trio pedigree is not well formatted 9663 else: 9664 msg_error = "TRIO pedigree not well formatted" 9665 log.error(msg_error) 9666 raise ValueError(msg_error) 9667 9668 # Construct trio list 9669 trio_samples = [ 9670 trio_ped.get("father", ""), 9671 trio_ped.get("mother", ""), 9672 trio_ped.get("child", ""), 9673 ] 9674 9675 else: 9676 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9677 samples_list = self.get_header_sample_list() 9678 if len(samples_list) >= 3: 9679 trio_samples = self.get_header_sample_list()[0:3] 9680 trio_ped = { 9681 "father": trio_samples[0], 9682 "mother": trio_samples[1], 9683 "child": trio_samples[2], 9684 } 9685 else: 9686 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9687 log.error(msg_error) 9688 raise ValueError(msg_error) 9689 9690 # Check trio pedigree 9691 if not trio_ped or len(trio_ped) != 3: 9692 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9693 log.error(msg_error) 9694 raise ValueError(msg_error) 9695 9696 # Log 9697 log.info( 9698 f"Calculation 'TRIO' - Samples: " 9699 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9700 ) 9701 9702 # Field 9703 trio_infos = prefix + trio_tag 9704 9705 # Variants table 9706 table_variants = self.get_table_variants() 9707 9708 # Header 9709 vcf_reader = self.get_header() 9710 9711 # Create variant id 9712 variant_id_column = self.get_variant_id_column() 9713 added_columns = [variant_id_column] 9714 9715 # variant_id, FORMAT and samples 9716 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9717 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9718 ) 9719 9720 # Create dataframe 9721 dataframe_trio = self.get_query_to_df( 9722 f""" SELECT {samples_fields} FROM {table_variants} """ 9723 ) 9724 9725 # Create trio column 9726 dataframe_trio[trio_infos] = dataframe_trio.apply( 9727 lambda row: trio(row, samples=trio_samples), axis=1 9728 ) 9729 9730 # Add trio to header 9731 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9732 trio_tag, 9733 ".", 9734 "String", 9735 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9736 "howard calculation", 9737 "0", 9738 self.code_type_map.get("String"), 9739 ) 9740 9741 # Update 9742 sql_update = f""" 9743 UPDATE {table_variants} 9744 SET "INFO" = 9745 concat( 9746 CASE 9747 WHEN "INFO" IS NULL OR "INFO" IN 
('','.') 9748 THEN '' 9749 ELSE concat("INFO", ';') 9750 END, 9751 CASE 9752 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9753 AND dataframe_trio."{trio_infos}" NOT NULL 9754 THEN concat( 9755 '{trio_tag}=', 9756 dataframe_trio."{trio_infos}" 9757 ) 9758 ELSE '' 9759 END 9760 ) 9761 FROM dataframe_trio 9762 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9763 """ 9764 self.conn.execute(sql_update) 9765 9766 # Remove added columns 9767 for added_column in added_columns: 9768 self.drop_column(column=added_column) 9769 9770 # Delete dataframe 9771 del dataframe_trio 9772 gc.collect() 9773 9774 def calculation_vaf_normalization(self) -> None: 9775 """ 9776 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9777 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9778 :return: The function does not return anything. 9779 """ 9780 9781 # if FORMAT and samples 9782 if ( 9783 "FORMAT" in self.get_header_columns_as_list() 9784 and self.get_header_sample_list() 9785 ): 9786 9787 # vaf_normalization annotation field 9788 vaf_normalization_tag = "VAF" 9789 9790 # VCF infos tags 9791 vcf_infos_tags = { 9792 "VAF": "VAF Variant Frequency", 9793 } 9794 9795 # Prefix 9796 prefix = self.get_explode_infos_prefix() 9797 9798 # Variants table 9799 table_variants = self.get_table_variants() 9800 9801 # Header 9802 vcf_reader = self.get_header() 9803 9804 # Do not calculate if VAF already exists 9805 if "VAF" in vcf_reader.formats: 9806 log.debug("VAF already on genotypes") 9807 return 9808 9809 # Create variant id 9810 variant_id_column = self.get_variant_id_column() 9811 added_columns = [variant_id_column] 9812 9813 # variant_id, FORMAT and samples 9814 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9815 f""" "{sample}" """ for sample in self.get_header_sample_list() 9816 ) 9817 9818 # Create dataframe 9819 query = f""" SELECT 
{variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9820 log.debug(f"query={query}") 9821 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9822 9823 vaf_normalization_set = [] 9824 9825 # for each sample vaf_normalization 9826 for sample in self.get_header_sample_list(): 9827 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9828 lambda row: vaf_normalization(row, sample=sample), axis=1 9829 ) 9830 vaf_normalization_set.append( 9831 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9832 ) 9833 9834 # Add VAF to FORMAT 9835 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9836 "FORMAT" 9837 ].apply(lambda x: str(x) + ":VAF") 9838 vaf_normalization_set.append( 9839 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9840 ) 9841 9842 # Add vaf_normalization to header 9843 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9844 id=vaf_normalization_tag, 9845 num="1", 9846 type="Float", 9847 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9848 type_code=self.code_type_map.get("Float"), 9849 ) 9850 9851 # Create fields to add in INFO 9852 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9853 9854 # Update 9855 sql_update = f""" 9856 UPDATE {table_variants} 9857 SET {sql_vaf_normalization_set} 9858 FROM dataframe_vaf_normalization 9859 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9860 9861 """ 9862 self.conn.execute(sql_update) 9863 9864 # Remove added columns 9865 for added_column in added_columns: 9866 self.drop_column(column=added_column) 9867 9868 # Delete dataframe 9869 del dataframe_vaf_normalization 9870 gc.collect() 9871 9872 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9873 """ 9874 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9875 field in a VCF file and updates the INFO column of the variants table with 
the calculated 9876 statistics. 9877 9878 :param info: The `info` parameter is a string that represents the type of information for which 9879 genotype statistics are calculated. It is used to generate various VCF info tags for the 9880 statistics, such as the number of occurrences, the list of values, the minimum value, the 9881 maximum value, the mean, the median, defaults to VAF 9882 :type info: str (optional) 9883 """ 9884 9885 # if FORMAT and samples 9886 if ( 9887 "FORMAT" in self.get_header_columns_as_list() 9888 and self.get_header_sample_list() 9889 ): 9890 9891 # vaf_stats annotation field 9892 vaf_stats_tag = info + "_stats" 9893 9894 # VCF infos tags 9895 vcf_infos_tags = { 9896 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 9897 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 9898 info + "_stats_min": f"genotype {info} Statistics - min {info}", 9899 info + "_stats_max": f"genotype {info} Statistics - max {info}", 9900 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 9901 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 9902 info 9903 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 9904 } 9905 9906 # Prefix 9907 prefix = self.get_explode_infos_prefix() 9908 9909 # Field 9910 vaf_stats_infos = prefix + vaf_stats_tag 9911 9912 # Variants table 9913 table_variants = self.get_table_variants() 9914 9915 # Header 9916 vcf_reader = self.get_header() 9917 9918 # Create variant id 9919 variant_id_column = self.get_variant_id_column() 9920 added_columns = [variant_id_column] 9921 9922 # variant_id, FORMAT and samples 9923 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9924 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9925 ) 9926 9927 # Create dataframe 9928 dataframe_vaf_stats = self.get_query_to_df( 9929 f""" SELECT {samples_fields} FROM {table_variants} """ 9930 ) 9931 9932 # Create vaf_stats column 9933 
dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 9934 lambda row: genotype_stats( 9935 row, samples=self.get_header_sample_list(), info=info 9936 ), 9937 axis=1, 9938 ) 9939 9940 # List of vcf tags 9941 sql_vaf_stats_fields = [] 9942 9943 # Check all VAF stats infos 9944 for stat in vcf_infos_tags: 9945 9946 # Extract stats 9947 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9948 lambda x: dict(x).get(stat, "") 9949 ) 9950 9951 # Add snpeff_hgvs to header 9952 vcf_reader.infos[stat] = vcf.parser._Info( 9953 stat, 9954 ".", 9955 "String", 9956 vcf_infos_tags.get(stat, "genotype statistics"), 9957 "howard calculation", 9958 "0", 9959 self.code_type_map.get("String"), 9960 ) 9961 9962 if len(sql_vaf_stats_fields): 9963 sep = ";" 9964 else: 9965 sep = "" 9966 9967 # Create fields to add in INFO 9968 sql_vaf_stats_fields.append( 9969 f""" 9970 CASE 9971 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9972 THEN concat( 9973 '{sep}{stat}=', 9974 dataframe_vaf_stats."{stat}" 9975 ) 9976 ELSE '' 9977 END 9978 """ 9979 ) 9980 9981 # SQL set for update 9982 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9983 9984 # Update 9985 sql_update = f""" 9986 UPDATE {table_variants} 9987 SET "INFO" = 9988 concat( 9989 CASE 9990 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9991 THEN '' 9992 ELSE concat("INFO", ';') 9993 END, 9994 {sql_vaf_stats_fields_set} 9995 ) 9996 FROM dataframe_vaf_stats 9997 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9998 9999 """ 10000 self.conn.execute(sql_update) 10001 10002 # Remove added columns 10003 for added_column in added_columns: 10004 self.drop_column(column=added_column) 10005 10006 # Delete dataframe 10007 del dataframe_vaf_stats 10008 gc.collect() 10009 10010 def calculation_transcripts_annotation( 10011 self, info_json: str = None, info_format: str = None 10012 ) -> None: 10013 """ 10014 The `calculation_transcripts_annotation` function creates a transcripts 
table and adds an info 10015 field to it if transcripts are available. 10016 10017 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10018 is a string parameter that represents the information field to be used in the transcripts JSON. 10019 It is used to specify the JSON format for the transcripts information. If no value is provided 10020 when calling the method, it defaults to " 10021 :type info_json: str 10022 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10023 method is a string parameter that specifies the format of the information field to be used in 10024 the transcripts JSON. It is used to define the format of the information field 10025 :type info_format: str 10026 """ 10027 10028 # Create transcripts table 10029 transcripts_table = self.create_transcript_view() 10030 10031 # Add info field 10032 if transcripts_table: 10033 self.transcript_view_to_variants( 10034 transcripts_table=transcripts_table, 10035 transcripts_info_field_json=info_json, 10036 transcripts_info_field_format=info_format, 10037 ) 10038 else: 10039 log.info("No Transcripts to process. Check param.json file configuration") 10040 10041 def calculation_transcripts_prioritization(self) -> None: 10042 """ 10043 The function `calculation_transcripts_prioritization` creates a transcripts table and 10044 prioritizes transcripts based on certain criteria. 10045 """ 10046 10047 # Create transcripts table 10048 transcripts_table = self.create_transcript_view() 10049 10050 # Add info field 10051 if transcripts_table: 10052 self.transcripts_prioritization(transcripts_table=transcripts_table) 10053 else: 10054 log.info("No Transcripts to process. 
Check param.json file configuration") 10055 10056 def calculation_transcripts_export(self) -> None: 10057 """ """ 10058 10059 # Create transcripts table 10060 transcripts_table = self.create_transcript_view() 10061 10062 # Add info field 10063 if transcripts_table: 10064 self.transcripts_export(transcripts_table=transcripts_table) 10065 else: 10066 log.info("No Transcripts to process. Check param.json file configuration") 10067 10068 ############### 10069 # Transcripts # 10070 ############### 10071 10072 def transcripts_export( 10073 self, transcripts_table: str = None, param: dict = {} 10074 ) -> bool: 10075 """ """ 10076 10077 log.debug("Start transcripts export...") 10078 10079 # Param 10080 if not param: 10081 param = self.get_param() 10082 10083 # Param export 10084 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10085 10086 # Output file 10087 transcripts_export_output = param_transcript_export.get("output", None) 10088 10089 if not param_transcript_export or not transcripts_export_output: 10090 log.warning(f"No transcriipts export parameters defined!") 10091 return False 10092 10093 # List of transcripts annotations 10094 query_describe = f""" 10095 SELECT column_name 10096 FROM ( 10097 DESCRIBE SELECT * FROM {transcripts_table} 10098 ) 10099 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10100 """ 10101 transcripts_annotations_list = list( 10102 self.get_query_to_df(query=query_describe)["column_name"] 10103 ) 10104 10105 # Create transcripts table for export 10106 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10107 random.choices(string.ascii_uppercase + string.digits, k=10) 10108 ) 10109 query_create_transcripts_table_export = f""" 10110 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10111 """ 10112 self.execute_query(query=query_create_transcripts_table_export) 10113 10114 # 
Output file format 10115 transcripts_export_output_format = get_file_format( 10116 filename=transcripts_export_output 10117 ) 10118 10119 # Format VCF - construct INFO 10120 if transcripts_export_output_format in ["vcf"]: 10121 10122 # Construct query update INFO and header 10123 query_update_info = [] 10124 for field in transcripts_annotations_list: 10125 10126 # If field not in header 10127 if field not in self.get_header_infos_list(): 10128 10129 # Add PZ Transcript in header 10130 self.get_header().infos[field] = vcf.parser._Info( 10131 field, 10132 ".", 10133 "String", 10134 f"Annotation '{field}' from transcript view", 10135 "unknown", 10136 "unknown", 10137 0, 10138 ) 10139 10140 # Add field as INFO/tag 10141 query_update_info.append( 10142 f""" 10143 CASE 10144 WHEN "{field}" IS NOT NULL 10145 THEN concat('{field}=', "{field}", ';') 10146 ELSE '' 10147 END 10148 """ 10149 ) 10150 10151 # Query param 10152 query_update_info_value = ( 10153 f""" concat('', {", ".join(query_update_info)}) """ 10154 ) 10155 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' 
AS 'FILTER', "INFO" """ 10156 10157 else: 10158 10159 # Query param 10160 query_update_info_value = f""" NULL """ 10161 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10162 10163 # Update query INFO column 10164 query_update = f""" 10165 UPDATE {transcripts_table_export} 10166 SET INFO = {query_update_info_value} 10167 10168 """ 10169 self.execute_query(query=query_update) 10170 10171 # Export 10172 self.export_output( 10173 output_file=transcripts_export_output, 10174 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10175 ) 10176 10177 # Drop transcripts export table 10178 query_drop_transcripts_table_export = f""" 10179 DROP TABLE {transcripts_table_export} 10180 """ 10181 self.execute_query(query=query_drop_transcripts_table_export) 10182 10183 def transcripts_prioritization( 10184 self, transcripts_table: str = None, param: dict = {} 10185 ) -> bool: 10186 """ 10187 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 10188 and updates the variants table with the prioritized information. 10189 10190 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10191 of the table containing transcripts data. If no value is provided, it defaults to "transcripts". 10192 This parameter is used to identify the table where the transcripts data is stored for the 10193 prioritization process 10194 :type transcripts_table: str 10195 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 10196 that contains various configuration settings for the prioritization process of transcripts. 
It 10197 is used to customize the behavior of the prioritization algorithm and includes settings such as 10198 the prefix for prioritization fields, default profiles, and other 10199 :type param: dict 10200 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 10201 transcripts prioritization process is successfully completed, and `False` if there are any 10202 issues or if no profile is defined for transcripts prioritization. 10203 """ 10204 10205 log.debug("Start transcripts prioritization...") 10206 10207 # Param 10208 if not param: 10209 param = self.get_param() 10210 10211 # Variants table 10212 table_variants = self.get_table_variants() 10213 10214 # Transcripts table 10215 if transcripts_table is None: 10216 transcripts_table = self.create_transcript_view( 10217 transcripts_table="transcripts", param=param 10218 ) 10219 if transcripts_table is None: 10220 msg_err = "No Transcripts table availalble" 10221 log.error(msg_err) 10222 raise ValueError(msg_err) 10223 log.debug(f"transcripts_table={transcripts_table}") 10224 10225 # Get transcripts columns 10226 columns_as_list_query = f""" 10227 DESCRIBE {transcripts_table} 10228 """ 10229 columns_as_list = list( 10230 self.get_query_to_df(columns_as_list_query)["column_name"] 10231 ) 10232 10233 # Create INFO if not exists 10234 if "INFO" not in columns_as_list: 10235 query_add_info = f""" 10236 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 10237 """ 10238 self.execute_query(query_add_info) 10239 10240 # Prioritization param and Force only PZ Score and Flag 10241 pz_param = param.get("transcripts", {}).get("prioritization", {}) 10242 10243 # PZ profile by default 10244 pz_profile_default = ( 10245 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 10246 ) 10247 10248 # Exit if no profile 10249 if pz_profile_default is None: 10250 log.warning("No profile defined for transcripts prioritization") 10251 return False 10252 10253 # PZ 
fields 10254 pz_param_pzfields = {} 10255 10256 # PZ field transcripts 10257 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 10258 10259 # Add PZ Transcript in header 10260 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 10261 pz_fields_transcripts, 10262 ".", 10263 "String", 10264 f"Transcript selected from prioritization process, profile {pz_profile_default}", 10265 "unknown", 10266 "unknown", 10267 code_type_map["String"], 10268 ) 10269 10270 # Mandatory fields 10271 pz_mandatory_fields_list = [ 10272 "Score", 10273 "Flag", 10274 "Tags", 10275 "Comment", 10276 "Infos", 10277 "Class", 10278 ] 10279 pz_mandatory_fields = [] 10280 for pz_mandatory_field in pz_mandatory_fields_list: 10281 pz_mandatory_fields.append( 10282 pz_param.get("pzprefix", "PTZ") + pz_mandatory_field 10283 ) 10284 10285 # PZ fields in param 10286 for pz_field in pz_param.get("pzfields", []): 10287 if pz_field in pz_mandatory_fields_list: 10288 pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = ( 10289 pz_param.get("pzprefix", "PTZ") + pz_field 10290 ) 10291 else: 10292 pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field 10293 pz_param_pzfields[pz_field] = pz_field_new 10294 10295 # Add PZ Transcript in header 10296 self.get_header().infos[pz_field_new] = vcf.parser._Info( 10297 pz_field_new, 10298 ".", 10299 "String", 10300 f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}", 10301 "unknown", 10302 "unknown", 10303 code_type_map["String"], 10304 ) 10305 10306 # PZ fields param 10307 pz_param["pzfields"] = pz_mandatory_fields 10308 10309 # Prioritization 10310 prioritization_result = self.prioritization( 10311 table=transcripts_table, 10312 pz_param=param.get("transcripts", {}).get("prioritization", {}), 10313 ) 10314 if not prioritization_result: 10315 log.warning("Transcripts prioritization not processed") 10316 return False 10317 10318 # PZ fields sql query 10319 
query_update_select_list = [] 10320 query_update_concat_list = [] 10321 query_update_order_list = [] 10322 for pz_param_pzfield in set( 10323 list(pz_param_pzfields.keys()) + pz_mandatory_fields 10324 ): 10325 query_update_select_list.append(f" {pz_param_pzfield}, ") 10326 10327 for pz_param_pzfield in pz_param_pzfields: 10328 query_update_concat_list.append( 10329 f""" 10330 , CASE 10331 WHEN {pz_param_pzfield} IS NOT NULL 10332 THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield}) 10333 ELSE '' 10334 END 10335 """ 10336 ) 10337 10338 # Order by 10339 pz_orders = ( 10340 param.get("transcripts", {}) 10341 .get("prioritization", {}) 10342 .get("prioritization_transcripts_order", {}) 10343 ) 10344 if not pz_orders: 10345 pz_orders = { 10346 pz_param.get("pzprefix", "PTZ") + "Flag": "DESC", 10347 pz_param.get("pzprefix", "PTZ") + "Score": "DESC", 10348 } 10349 for pz_order in pz_orders: 10350 query_update_order_list.append( 10351 f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """ 10352 ) 10353 10354 # Fields to explode 10355 fields_to_explode = ( 10356 list(pz_param_pzfields.keys()) 10357 + pz_mandatory_fields 10358 + list(pz_orders.keys()) 10359 ) 10360 # Remove transcript column as a specific transcript column 10361 if "transcript" in fields_to_explode: 10362 fields_to_explode.remove("transcript") 10363 10364 # Fields intranscripts table 10365 query_transcripts_table = f""" 10366 DESCRIBE SELECT * FROM {transcripts_table} 10367 """ 10368 query_transcripts_table = self.get_query_to_df(query=query_transcripts_table) 10369 10370 # Check fields to explode 10371 for field_to_explode in fields_to_explode: 10372 if field_to_explode not in self.get_header_infos_list() + list( 10373 query_transcripts_table.column_name 10374 ): 10375 msg_err = f"INFO/{field_to_explode} NOT IN header" 10376 log.error(msg_err) 10377 raise ValueError(msg_err) 10378 10379 # Explode fields to explode 10380 self.explode_infos( 10381 table=transcripts_table, 10382 
fields=fields_to_explode, 10383 ) 10384 10385 # Transcript preference file 10386 transcripts_preference_file = ( 10387 param.get("transcripts", {}) 10388 .get("prioritization", {}) 10389 .get("prioritization_transcripts", {}) 10390 ) 10391 transcripts_preference_file = full_path(transcripts_preference_file) 10392 10393 # Transcript preference forced 10394 transcript_preference_force = ( 10395 param.get("transcripts", {}) 10396 .get("prioritization", {}) 10397 .get("prioritization_transcripts_force", False) 10398 ) 10399 # Transcript version forced 10400 transcript_version_force = ( 10401 param.get("transcripts", {}) 10402 .get("prioritization", {}) 10403 .get("prioritization_transcripts_version_force", False) 10404 ) 10405 10406 # Transcripts Ranking 10407 if transcripts_preference_file: 10408 10409 # Transcripts file to dataframe 10410 if os.path.exists(transcripts_preference_file): 10411 transcripts_preference_dataframe = transcripts_file_to_df( 10412 transcripts_preference_file 10413 ) 10414 else: 10415 log.error( 10416 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10417 ) 10418 raise ValueError( 10419 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10420 ) 10421 10422 # Order by depending to transcript preference forcing 10423 if transcript_preference_force: 10424 order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """ 10425 else: 10426 order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """ 10427 10428 # Transcript columns joined depend on version consideration 10429 if transcript_version_force: 10430 transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """ 10431 else: 10432 transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """ 10433 10434 # Query 
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update (no transcript preference file)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table (top-ranked transcript only)
        # NOTE(review): the WHERE clause references the 'variants' table name literally while the
        # UPDATE target is {table_variants} — confirm they always match
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                    WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                    THEN ''
                    ELSE concat("INFO", ';')
                END,
                concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True

    def create_transcript_view_from_columns_map(
        self,
        transcripts_table: str = "transcripts",
        columns_maps: dict = {},
        added_columns: list = [],
        temporary_tables: list = None,
        annotation_fields: list = None,
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple[list, list, list]:
        """
        The `create_transcript_view_from_columns_map` function generates temporary table views
        based on specified columns mappings for transcripts data.

        :param transcripts_table: Name of the table where the transcripts data is stored; used as
        prefix for the generated temporary table names, defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param columns_maps: Mapping configurations; each entry provides the main transcript
        column ("transcripts_column") and additional information columns
        ("transcripts_infos_columns"), plus optional "column_rename", "column_clean" and
        "column_case" overrides (a list of dicts in practice, despite the `dict` annotation)
        :type columns_maps: dict
        :param added_columns: Columns added to the variants table by exploding the transcript
        information columns; extended and returned
        :type added_columns: list
        :param temporary_tables: Names of the temporary tables created during the process;
        extended and returned
        :type temporary_tables: list
        :param annotation_fields: Annotation fields exposed by the created views; extended and
        returned
        :type annotation_fields: list
        :param column_rename: Mapping of original column names to the desired renamed column names
        :type column_rename: dict
        :param column_clean: Whether column names are cleaned (non-alphanumeric characters
        removed), defaults to False
        :type column_clean: bool (optional)
        :param column_case: Case transformation applied to column names ("lower" or "upper"),
        or None to leave them unchanged
        :type column_case: str
        :return: A tuple containing three lists: `added_columns`, `temporary_tables`, and
        `annotation_fields`.
        """

        log.debug("Start transcrpts view creation from columns map...")

        # Example of "from_columns_map" configuration:
        # "from_columns_map": [
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "Ensembl_geneid",
        #             "LIST_S2_score",
        #             "LIST_S2_pred",
        #         ],
        #     },
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "VARITY_R_score",
        #             "Aloft_pred",
        #         ],
        #     },
        # ],

        # Init
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        # Variants table
        table_variants = self.get_table_variants()

        for columns_map in columns_maps:

            # Transcript column
            transcripts_column = columns_map.get("transcripts_column", None)

            # Transcripts infos columns
            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

            # Transcripts infos columns rename
            # NOTE(review): these three assignments overwrite the parameters, so a value set by
            # one columns_map carries over to the following iterations — confirm intended
            column_rename = columns_map.get("column_rename", column_rename)

            # Transcripts infos columns clean
            column_clean = columns_map.get("column_clean", column_clean)

            # Transcripts infos columns case
            column_case = columns_map.get("column_case", column_case)

            if transcripts_column is not None:

                # Explode the transcript column and its infos columns into dedicated columns
                added_columns += self.explode_infos(
                    fields=[transcripts_column] + transcripts_infos_columns
                )

                # View clauses
                clause_select_variants = []
                clause_select_tanscripts = []
                for field in [transcripts_column] + transcripts_infos_columns:

                    # AS field
                    as_field = field

                    # Rename
                    if column_rename:
                        as_field = column_rename.get(as_field, as_field)

                    # Clean
                    if column_clean:
                        as_field = clean_annotation_field(as_field)

                    # Case
                    if column_case:
                        if column_case.lower() in ["lower"]:
                            as_field = as_field.lower()
                        elif column_case.lower() in ["upper"]:
                            as_field = as_field.upper()

                    # Clause select Variants (comma-separated values exploded into rows)
                    clause_select_variants.append(
                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                    )

                    # The transcript column keeps its original name; other columns are aliased
                    if field in [transcripts_column]:
                        clause_select_tanscripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                        )
                    else:
                        clause_select_tanscripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
                        )
                        annotation_fields.append(as_field)

                # Query for the view
                query = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        "{transcripts_column}" AS 'transcript',
                        {", ".join(clause_select_tanscripts)}
                    FROM (
                        SELECT
                            "#CHROM", POS, REF, ALT, INFO,
                            {", ".join(clause_select_variants)}
                        FROM {table_variants}
                    )
                    WHERE "{transcripts_column}" IS NOT NULL
                """

                # Temporary table name with a random suffix to avoid collisions
                temporary_table = transcripts_table + "".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                )

                # Register and create the temporary table
                temporary_tables.append(temporary_table)
                query_view = f"""
                    CREATE TEMPORARY TABLE {temporary_table}
                    AS ({query})
                """
                self.execute_query(query=query_view)

        return added_columns, temporary_tables, annotation_fields

    def create_transcript_view_from_column_format(
        self,
        transcripts_table: str = "transcripts",
        column_formats: dict = {},
        temporary_tables: list = None,
        annotation_fields: list = None,
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple[list, list]:
        """
        The `create_transcript_view_from_column_format` function generates a transcript view based on
        specified column formats, adds additional
        columns and annotation fields, and returns the list of
        temporary tables and annotation fields.

        :param transcripts_table: Name of the table containing the transcripts data; used as
        prefix for the generated temporary view names, defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param column_formats: Format configurations; each entry maps a transcripts column
        (e.g. "ANN") to the transcripts infos column holding the transcript identifier
        (e.g. "Feature_ID"), plus optional "column_rename", "column_clean" and "column_case"
        overrides (a list of dicts in practice, despite the `dict` annotation)
        :type column_formats: dict
        :param temporary_tables: Names of the temporary views created during the process; extended
        and returned
        :type temporary_tables: list
        :param annotation_fields: Annotation fields extracted from the temporary views (all
        columns except '#CHROM', 'POS', 'REF', 'ALT'); extended and returned
        :type annotation_fields: list
        :param column_rename: Mapping of original column names to new column names
        :type column_rename: dict
        :param column_clean: Whether column names are cleaned (non-alphanumeric characters
        removed), defaults to False
        :type column_clean: bool (optional)
        :param column_case: Case transformation applied to column names ("lower" or "upper"),
        or None to leave them unchanged
        :type column_case: str
        :return: Two lists: `temporary_tables` and `annotation_fields`.
        """

        log.debug("Start transcrpts view creation from column format...")

        # Example of "from_column_format" configuration:
        # "from_column_format": [
        #     {
        #         "transcripts_column": "ANN",
        #         "transcripts_infos_column": "Feature_ID",
        #     }
        # ],

        # Init
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        for column_format in column_formats:

            # annotation field and transcript annotation field
            annotation_field = column_format.get("transcripts_column", "ANN")
            transcript_annotation = column_format.get(
                "transcripts_infos_column", "Feature_ID"
            )

            # Transcripts infos columns rename
            # NOTE(review): these assignments overwrite the parameters, so a value set by one
            # column_format carries over to the following iterations — confirm intended
            column_rename = column_format.get("column_rename", column_rename)

            # Transcripts infos columns clean
            column_clean = column_format.get("column_clean", column_clean)

            # Transcripts infos columns case
            column_case = column_format.get("column_case", column_case)

            # Temporary view name with a random suffix to avoid collisions
            temporary_view_name = transcripts_table + "".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)
            )

            # Create temporary view (returns None when the annotation field is not in the header)
            temporary_view_name = self.annotation_format_to_table(
                uniquify=True,
                annotation_field=annotation_field,
                view_name=temporary_view_name,
                annotation_id=transcript_annotation,
                column_rename=column_rename,
                column_clean=column_clean,
                column_case=column_case,
            )

            # Annotation fields
            if temporary_view_name:
                query_annotation_fields = f"""
                    SELECT *
                    FROM (
                        DESCRIBE SELECT *
                        FROM {temporary_view_name}
                    )
                    WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
                """
                df_annotation_fields = self.get_query_to_df(
                    query=query_annotation_fields
                )

                # Add temporary view and annotation fields
temporary_tables.append(temporary_view_name) 10803 annotation_fields += list(set(df_annotation_fields["column_name"])) 10804 10805 return temporary_tables, annotation_fields 10806 10807 def create_transcript_view( 10808 self, 10809 transcripts_table: str = None, 10810 transcripts_table_drop: bool = False, 10811 param: dict = {}, 10812 ) -> str: 10813 """ 10814 The `create_transcript_view` function generates a transcript view by processing data from a 10815 specified table based on provided parameters and structural information. 10816 10817 :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function 10818 is used to specify the name of the table that will store the final transcript view data. If a table 10819 name is not provided, the function will create a new table to store the transcript view data, and by 10820 default,, defaults to transcripts 10821 :type transcripts_table: str (optional) 10822 :param transcripts_table_drop: The `transcripts_table_drop` parameter in the 10823 `create_transcript_view` function is a boolean parameter that determines whether to drop the 10824 existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, 10825 the function will drop the existing transcripts table if it exists, defaults to False 10826 :type transcripts_table_drop: bool (optional) 10827 :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that 10828 contains information needed to create a transcript view. It includes details such as the structure 10829 of the transcripts, columns mapping, column formats, and other necessary information for generating 10830 the view. This parameter allows for flexibility and customization 10831 :type param: dict 10832 :return: The `create_transcript_view` function returns the name of the transcripts table that was 10833 created or modified during the execution of the function. 
        """

        log.debug("Start transcripts view creation...")

        # Default
        transcripts_table_default = "transcripts"

        # Param
        if not param:
            param = self.get_param()

        # Struct
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping file
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Transcripts mapping force (keep only transcripts present in the mapping file)
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # added_columns
            added_columns = []

            # Temporary tables
            temporary_tables = []

            # Annotation fields
            annotation_fields = []

            # from columns map
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # from column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Remove some specific fields/column
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # transcript table tmp
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # Merge on transcript
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # Transcript dataframe
                # NOTE(review): the dataframe is presumably referenced by name in the SQL below
                # via DuckDB's replacement scan — confirm
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version remove
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Transcript column for group by merge
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping (mapped transcript ID wins over the original one)
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Add transcript filter from mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                # NOTE(review): these two variables appear unused below in this branch
                # (query_transcript_column is used directly) — confirm
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcript view if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            transcripts_table = None

        return transcripts_table

    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        The `annotation_format_to_table` function converts annotation data from a VCF file into a
        structured table format, ensuring unique values and creating a temporary table for further
        processing or analysis.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure
        unique values in the output or not.
        If set to `True`, the function will make sure that the
        output values are unique, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The INFO field of the VCF that contains the annotation for each
        variant (e.g. snpEff "ANN"), defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Identifier of the annotation feature; the corresponding column is
        exposed as 'transcript' in the resulting table, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table created to store the transformed annotation
        data, defaults to transcripts
        :type view_name: str (optional)
        :param column_rename: Mapping of original column names to new column names
        :type column_rename: dict
        :param column_clean: Whether annotation column names are cleaned (non-alphanumeric
        characters removed), defaults to False
        :type column_clean: bool (optional)
        :param column_case: Case transformation applied to the column names ("lower" or "upper"),
        or None to leave them unchanged
        :type column_case: str
        :return: The name of the view created (`view_name`), or None when `annotation_field` is
        not present in the header.
        """

        # Name of the exploded-annotation working column
        annotation_format = "annotation_explode"

        # The transcript annotation identifier follows the same rename/clean rules as the columns
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix
        # NOTE(review): any truthy prefix is replaced by the literal "INFO/" — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped at the end)
        added_columns = []

        # Explode annotation field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field names from the INFO header description
            # (expects a quoted, " | "-separated list, e.g. snpEff ANN)
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
11185 ann_header_desc = {} 11186 for i in range(len(ann_header_match)): 11187 ann_header_info = "".join( 11188 char for char in ann_header_match[i] if char.isalnum() 11189 ) 11190 ann_header.append(ann_header_info) 11191 ann_header_desc[ann_header_info] = ann_header_match[i] 11192 if not ann_header_desc: 11193 raise ValueError("Invalid header description format") 11194 else: 11195 raise ValueError("Invalid header description format") 11196 11197 # Create variant id 11198 variant_id_column = self.get_variant_id_column() 11199 added_columns += [variant_id_column] 11200 11201 # Create dataframe 11202 dataframe_annotation_format = self.get_query_to_df( 11203 f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 11204 ) 11205 11206 # Create annotation columns 11207 dataframe_annotation_format[ 11208 annotation_format_infos 11209 ] = dataframe_annotation_format[annotation_infos].apply( 11210 lambda x: explode_annotation_format( 11211 annotation=str(x), 11212 uniquify=uniquify, 11213 output_format="JSON", 11214 prefix="", 11215 header=list(ann_header_desc.values()), 11216 ) 11217 ) 11218 11219 # Find keys 11220 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 11221 df_keys = self.get_query_to_df(query=query_json) 11222 11223 # Check keys 11224 query_json_key = [] 11225 for _, row in df_keys.iterrows(): 11226 11227 # Key 11228 key = row.iloc[0] 11229 key_clean = key 11230 11231 # key rename 11232 if column_rename: 11233 key_clean = column_rename.get(key_clean, key_clean) 11234 11235 # key clean 11236 if column_clean: 11237 key_clean = clean_annotation_field(key_clean) 11238 11239 # Key case 11240 if column_case: 11241 if column_case.lower() in ["lower"]: 11242 key_clean = key_clean.lower() 11243 elif column_case.lower() in ["upper"]: 11244 key_clean = key_clean.upper() 11245 11246 # Type 11247 query_json_type = f"""SELECT 
unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type (SQL type inferred from the remaining non-empty values)
                column_type = detect_column_type(df_json_type[key_clean])

                # Append a typed column expression for this key to the final view's SELECT list
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create view: one row per exploded annotation, variant keys + typed annotation columns
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Return None
            view_name = None

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        The `transcript_view_to_variants` function updates a variants table with information from
        transcripts in JSON format.

        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
        table containing the transcripts data. If this parameter is not provided, the function will
        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
        column in the `transcripts_table` that contains the unique identifier for each transcript. This
        identifier is used to match transcripts with variants in the database
        :type transcripts_column_id: str
        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
        of the column in the variants table where the transcripts information will be stored in JSON
        format. This parameter allows you to define the column in the variants table that will hold the
        JSON-formatted information about transcripts
        :type transcripts_info_json: str
        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
        specify the field in the VCF header that will contain information about transcripts in JSON
        format. This field will be added to the VCF header as an INFO field with the specified name
        :type transcripts_info_field_json: str
        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
        format of the information about transcripts that will be stored in the variants table. This
        format can be used to define how the transcript information will be structured or displayed
        within the variants table
        :type transcripts_info_format: str
        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
        specify the field in the VCF header that will contain information about transcripts in a
        specific format. This field will be added to the VCF header as an INFO field with the specified
        name
        :type transcripts_info_field_format: str
        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
        that contains various configuration settings related to transcripts. It is used to provide
        default values for certain parameters if they are not explicitly provided when calling the
        method. The `param` dictionary can be passed as an argument
        :type param: dict
        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
        if the operation is successful and `False` if certain conditions are not met.
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default values used when neither the argument nor param provides one
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Check info columns param: nothing requested -> nothing to do
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts infos columns: every column except the variant keys and the transcript id
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # View results: build the per-column SELECT / JSON-struct / pipe-format clauses
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            # Do not consider INFO field for export into fields
            if field not in ["INFO"]:
                clause_select.append(
                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
                )
                clause_to_json.append(f""" '{field}': "{field}" """)
                clause_to_format.append(f""" "{field}" """)

        # Update
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" is a typo in the emitted header source/version fields —
            # kept as-is for output compatibility; consider fixing to "unknown"
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Add to update
            # NOTE(review): this references t.{transcripts_info_json}; when only
            # transcripts_info_field_json is set, transcripts_info_json is still None and the
            # interpolated identifier becomes the literal "None" (see the commented-out
            # fallback above) — confirm intended
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query: aggregate one JSON object per variant, keyed by transcript id
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                AND {table_variants}."POS" = t."POS"
                AND {table_variants}."REF" = t."REF"
                AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        else:

            # Set variable for internal queries
            transcripts_info_format = "transcripts_info_format"

        # Transcripts to info field in FORMAT (structured 'transcript|...' string, not JSON)
        if transcripts_info_field_format:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Add to update
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', 
'.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query: one pipe-separated record per transcript, aggregated per variant
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                AND {table_variants}."POS" = t."POS"
                AND {table_variants}."REF" = t."REF"
                AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True

    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        The `rename_info_fields` function renames specified fields in a VCF file header and updates
        corresponding INFO fields in the variants table.

        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary
        represent the original field names that need to be renamed, and the corresponding values
        represent the new names to which the fields should be
        :type fields_to_rename: dict
        :param table: The `table` parameter in the `rename_info_fields` function represents the name of
        the table in which the variants data is stored. This table contains information about genetic
        variants, and the function updates the corresponding INFO fields in this table when renaming
        specified fields in the VCF file header
        :type table: str
        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that contains
        the original field names as keys and their corresponding new names (or None if the field was
        removed) as values after renaming or removing specified fields in a VCF file header and updating
        corresponding INFO fields in the variants table.
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        if table is None:
            table = self.get_table_variants()

        # regexp replace fonction
        # Renames are folded into nested regexp_replace() calls over INFO; a new nesting
        # chain is started every `regex_replace_partition` fields so the generated SQL
        # expression does not grow without bound
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "INFO"

        # Skip entirely when nothing to rename or in read-only access mode
        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename header: copy the INFO record under the new name
                    # (a None target means "remove the field"), then drop the old key
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # Rename INFO patterns: field at start or after ';', as flag or key=value
                    # NOTE(review): regexp_replace without the 'g' option replaces only the
                    # first match — assumes each field occurs at most once in INFO; confirm
                    field_pattern = rf'(^|;)({field_to_rename})($|;|=[^;]*)'
                    if field_renamed is not None:
                        field_renamed_pattern = rf'\1{field_renamed}\3'
                    else:
                        field_renamed_pattern = ''

                    # regexp replace: nest onto the current chain, keyed by partition index
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "INFO"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
                    else:
                        log.info(f"Rename or remove 
fields - field '{field_to_rename}' removed")

                else:

                    log.warning(f"Rename or remove fields - field '{field_to_rename}' not in header")

        # Rename INFO
        # Apply each accumulated nested-regexp_replace chain as a separate UPDATE pass
        for regex_replace_key, regex_replace in regex_replace_dict.items():
            log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
            query = f"""
                UPDATE {table}
                SET
                    INFO = {regex_replace}
            """
            log.debug(f"query={query}")
            self.execute_query(query=query)

        return fields_renamed

    def calculation_rename_info_fields(
        self,
        fields_to_rename: dict = None,
        table: str = None,
        operation_name: str = "RENAME_INFO_FIELDS",
    ) -> None:
        """
        The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
        fields to rename and table if provided, and then calls another function to rename the fields.

        :param fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be
        renamed in a table. Each key-value pair in the dictionary represents the original field name as
        the key and the new field name as the value
        :type fields_to_rename: dict
        :param table: The `table` parameter in the `calculation_rename_info_fields` method is used to
        specify the name of the table for which the fields are to be renamed. It is a string type
        parameter
        :type table: str
        :param operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
        method is a string that specifies the name of the operation being performed. In this context, it
        is used as a default value for the operation name if not explicitly provided when calling the
        function, defaults to RENAME_INFO_FIELDS
        :type operation_name: str (optional)
        """

        # Param
        param = self.get_param()

        # Get param fields to rename
        # (param["calculation"]["calculations"][operation_name]["fields_to_rename"])
        param_fields_to_rename = (
            param.get("calculation", {})
            .get("calculations", {})
            .get(operation_name, {})
            .get("fields_to_rename", None)
        )

        # Get param table
        param_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get(operation_name, {})
            .get("table", None)
        )

        # Init fields_to_rename (an explicit argument takes precedence over param)
        if fields_to_rename is None:
            fields_to_rename = param_fields_to_rename

        # Init table
        if table is None:
            table = param_table

        renamed_fields = self.rename_info_fields(
            fields_to_rename=fields_to_rename, table=table
        )

        log.debug(f"renamed_fields:{renamed_fields}")
39 def __init__( 40 self, 41 conn=None, 42 input: str = None, 43 output: str = None, 44 config: dict = {}, 45 param: dict = {}, 46 load: bool = False, 47 ) -> None: 48 """ 49 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 50 header 51 52 :param conn: the connection to the database 53 :param input: the input file 54 :param output: the output file 55 :param config: a dictionary containing the configuration of the model 56 :param param: a dictionary containing the parameters of the model 57 """ 58 59 # Init variables 60 self.init_variables() 61 62 # Input 63 self.set_input(input) 64 65 # Config 66 self.set_config(config) 67 68 # Param 69 self.set_param(param) 70 71 # Output 72 self.set_output(output) 73 74 # connexion 75 self.set_connexion(conn) 76 77 # Header 78 self.set_header() 79 80 # Samples 81 self.set_samples() 82 83 # Load data 84 if load: 85 self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
87 def set_samples(self, samples: list = None) -> list: 88 """ 89 The function `set_samples` sets the samples attribute of an object to a provided list or 90 retrieves it from a parameter dictionary. 91 92 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 93 input and sets the `samples` attribute of the class to the provided list. If no samples are 94 provided, it tries to get the samples from the class's parameters using the `get_param` method 95 :type samples: list 96 :return: The `samples` list is being returned. 97 """ 98 99 if not samples: 100 samples = self.get_param().get("samples", {}).get("list", None) 101 102 self.samples = samples 103 104 return samples
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: The `set_samples` method is a method of a class that takes a list of samples as
  input and sets the `samples` attribute of the class to the provided list. If no samples are
  provided, it tries to get the samples from the class's parameters using the `get_param` method.
Returns
The `samples` list is being returned.
106 def get_samples(self) -> list: 107 """ 108 This function returns a list of samples. 109 :return: The `get_samples` method is returning the `samples` attribute of the object. 110 """ 111 112 return self.samples
This function returns a list of samples.
Returns
The `get_samples` method is returning the `samples` attribute of the object.
114 def get_samples_check(self) -> bool: 115 """ 116 This function returns the value of the "check" key within the "samples" dictionary retrieved 117 from the parameters. 118 :return: The method `get_samples_check` is returning the value of the key "check" inside the 119 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 120 method. If the key "check" is not found, it will return `False`. 121 """ 122 123 return self.get_param().get("samples", {}).get("check", True)
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method `get_samples_check` returns the value of the key "check" inside the "samples"
dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the
key "check" is not found, it returns `True` (the default used in the code).
125 def set_input(self, input: str = None) -> None: 126 """ 127 The function `set_input` takes a file name as input, extracts the name and extension, and sets 128 attributes in the class accordingly. 129 130 :param input: The `set_input` method in the provided code snippet is used to set attributes 131 related to the input file. Here's a breakdown of the parameters and their usage in the method: 132 :type input: str 133 """ 134 135 if input and not isinstance(input, str): 136 try: 137 self.input = input.name 138 except: 139 log.error(f"Input file '{input} in bad format") 140 raise ValueError(f"Input file '{input} in bad format") 141 else: 142 self.input = input 143 144 # Input format 145 if input: 146 input_name, input_extension = os.path.splitext(self.input) 147 self.input_name = input_name 148 self.input_extension = input_extension 149 self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: The `set_input` method in the provided code snippet is used to set attributes
  related to the input file. Here's a breakdown of the parameters and their usage in the method:
151 def set_config(self, config: dict) -> None: 152 """ 153 The set_config function takes a config object and assigns it as the configuration object for the 154 class. 155 156 :param config: The `config` parameter in the `set_config` function is a dictionary object that 157 contains configuration settings for the class. When you call the `set_config` function with a 158 dictionary object as the argument, it will set that dictionary as the configuration object for 159 the class 160 :type config: dict 161 """ 162 163 self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: The `config` parameter in the `set_config` function is a dictionary object that
  contains configuration settings for the class. When you call the `set_config` function with a
  dictionary object as the argument, it will set that dictionary as the configuration object for
  the class.
165 def set_param(self, param: dict) -> None: 166 """ 167 This function sets a parameter object for the class based on the input dictionary. 168 169 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 170 as the `param` attribute of the class instance 171 :type param: dict 172 """ 173 174 self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: The `set_param` method you provided takes a dictionary object as input and sets it
  as the `param` attribute of the class instance.
176 def init_variables(self) -> None: 177 """ 178 This function initializes the variables that will be used in the rest of the class 179 """ 180 181 self.prefix = "howard" 182 self.table_variants = "variants" 183 self.dataframe = None 184 185 self.comparison_map = { 186 "gt": ">", 187 "gte": ">=", 188 "lt": "<", 189 "lte": "<=", 190 "equals": "=", 191 "contains": "SIMILAR TO", 192 } 193 194 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 195 196 self.code_type_map_to_sql = { 197 "Integer": "INTEGER", 198 "String": "VARCHAR", 199 "Float": "FLOAT", 200 "Flag": "VARCHAR", 201 } 202 203 self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
205 def get_indexing(self) -> bool: 206 """ 207 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 208 returns False. 209 :return: The value of the indexing parameter. 210 """ 211 212 return self.get_param().get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
214 def get_connexion_config(self) -> dict: 215 """ 216 The function `get_connexion_config` returns a dictionary containing the configuration for a 217 connection, including the number of threads and memory limit. 218 :return: a dictionary containing the configuration for the Connexion library. 219 """ 220 221 # config 222 config = self.get_config() 223 224 # Connexion config 225 connexion_config = {} 226 threads = self.get_threads() 227 228 # Threads 229 if threads: 230 connexion_config["threads"] = threads 231 232 # Memory 233 # if config.get("memory", None): 234 # connexion_config["memory_limit"] = config.get("memory") 235 if self.get_memory(): 236 connexion_config["memory_limit"] = self.get_memory() 237 238 # Temporary directory 239 if config.get("tmp", None): 240 connexion_config["temp_directory"] = config.get("tmp") 241 242 # Access 243 if config.get("access", None): 244 access = config.get("access") 245 if access in ["RO"]: 246 access = "READ_ONLY" 247 elif access in ["RW"]: 248 access = "READ_WRITE" 249 connexion_db = self.get_connexion_db() 250 if connexion_db in ":memory:": 251 access = "READ_WRITE" 252 connexion_config["access_mode"] = access 253 254 return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the database connection (threads, memory limit,
temporary directory, access mode).
256 def get_duckdb_settings(self) -> dict: 257 """ 258 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 259 string. 260 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 261 """ 262 263 # config 264 config = self.get_config() 265 266 # duckdb settings 267 duckdb_settings_dict = {} 268 if config.get("duckdb_settings", None): 269 duckdb_settings = config.get("duckdb_settings") 270 duckdb_settings = full_path(duckdb_settings) 271 # duckdb setting is a file 272 if os.path.exists(duckdb_settings): 273 with open(duckdb_settings) as json_file: 274 duckdb_settings_dict = yaml.safe_load(json_file) 275 # duckdb settings is a string 276 else: 277 duckdb_settings_dict = json.loads(duckdb_settings) 278 279 return duckdb_settings_dict
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
281 def set_connexion_db(self) -> str: 282 """ 283 The function `set_connexion_db` returns the appropriate database connection string based on the 284 input format and connection type. 285 :return: the value of the variable `connexion_db`. 286 """ 287 288 # Default connexion db 289 default_connexion_db = ":memory:" 290 291 # Find connexion db 292 if self.get_input_format() in ["db", "duckdb"]: 293 connexion_db = self.get_input() 294 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 295 connexion_db = default_connexion_db 296 elif self.get_connexion_type() in ["tmpfile"]: 297 tmp_name = tempfile.mkdtemp( 298 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 299 ) 300 connexion_db = f"{tmp_name}/tmp.db" 301 elif self.get_connexion_type() != "": 302 connexion_db = self.get_connexion_type() 303 else: 304 connexion_db = default_connexion_db 305 306 # Set connexion db 307 self.connexion_db = connexion_db 308 309 return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable `connexion_db`.
311 def set_connexion(self, conn) -> None: 312 """ 313 The function `set_connexion` creates a connection to a database, with options for different 314 database formats and settings. 315 316 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 317 database. If a connection is not provided, a new connection to an in-memory database is created. 318 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 319 sqlite 320 """ 321 322 # Connexion db 323 connexion_db = self.set_connexion_db() 324 325 # Connexion config 326 connexion_config = self.get_connexion_config() 327 328 # Connexion format 329 connexion_format = self.get_config().get("connexion_format", "duckdb") 330 # Set connexion format 331 self.connexion_format = connexion_format 332 333 # Connexion 334 if not conn: 335 if connexion_format in ["duckdb"]: 336 conn = duckdb.connect(connexion_db, config=connexion_config) 337 # duckDB settings 338 duckdb_settings = self.get_duckdb_settings() 339 if duckdb_settings: 340 for setting in duckdb_settings: 341 setting_value = duckdb_settings.get(setting) 342 if isinstance(setting_value, str): 343 setting_value = f"'{setting_value}'" 344 conn.execute(f"PRAGMA {setting}={setting_value};") 345 elif connexion_format in ["sqlite"]: 346 conn = sqlite3.connect(connexion_db) 347 348 # Set connexion 349 self.conn = conn 350 351 # Log 352 log.debug(f"connexion_format: {connexion_format}") 353 log.debug(f"connexion_db: {connexion_db}") 354 log.debug(f"connexion config: {connexion_config}") 355 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: The `conn` parameter in the `set_connexion` method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then sets up the connection based on the specified format (e.g., duckdb or sqlite).
357 def set_output(self, output: str = None) -> None: 358 """ 359 The `set_output` function in Python sets the output file based on the input or a specified key 360 in the config file, extracting the output name, extension, and format. 361 362 :param output: The `output` parameter in the `set_output` method is used to specify the name of 363 the output file. If the config file has an 'output' key, the method sets the output to the value 364 of that key. If no output is provided, it sets the output to `None` 365 :type output: str 366 """ 367 368 if output and not isinstance(output, str): 369 self.output = output.name 370 else: 371 self.output = output 372 373 # Output format 374 if self.output: 375 output_name, output_extension = os.path.splitext(self.output) 376 self.output_name = output_name 377 self.output_extension = output_extension 378 self.output_format = self.output_extension.replace(".", "") 379 else: 380 self.output_name = None 381 self.output_extension = None 382 self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: The `output` parameter in the `set_output` method is used to specify the name of the output file. If no output is provided, the output is set to `None`.
384 def set_header(self) -> None: 385 """ 386 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 387 """ 388 389 input_file = self.get_input() 390 default_header_list = [ 391 "##fileformat=VCFv4.2", 392 "#CHROM POS ID REF ALT QUAL FILTER INFO", 393 ] 394 395 # Full path 396 input_file = full_path(input_file) 397 398 if input_file: 399 400 input_format = self.get_input_format() 401 input_compressed = self.get_input_compressed() 402 config = self.get_config() 403 header_list = default_header_list 404 if input_format in [ 405 "vcf", 406 "hdr", 407 "tsv", 408 "csv", 409 "psv", 410 "parquet", 411 "db", 412 "duckdb", 413 ]: 414 # header provided in param 415 if config.get("header_file", None): 416 with open(config.get("header_file"), "rt") as f: 417 header_list = self.read_vcf_header(f) 418 # within a vcf file format (header within input file itsself) 419 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 420 # within a compressed vcf file format (.vcf.gz) 421 if input_compressed: 422 with bgzf.open(input_file, "rt") as f: 423 header_list = self.read_vcf_header(f) 424 # within an uncompressed vcf file format (.vcf) 425 else: 426 with open(input_file, "rt") as f: 427 header_list = self.read_vcf_header(f) 428 # header provided in default external file .hdr 429 elif os.path.exists((input_file + ".hdr")): 430 with open(input_file + ".hdr", "rt") as f: 431 header_list = self.read_vcf_header(f) 432 else: 433 try: # Try to get header info fields and file columns 434 435 with tempfile.TemporaryDirectory() as tmpdir: 436 437 # Create database 438 db_for_header = Database(database=input_file) 439 440 # Get header columns for infos fields 441 db_header_from_columns = ( 442 db_for_header.get_header_from_columns() 443 ) 444 445 # Get real columns in the file 446 db_header_columns = db_for_header.get_columns() 447 448 # Write header file 449 header_file_tmp = os.path.join(tmpdir, "header") 450 f = open(header_file_tmp, "w") 451 
vcf.Writer(f, db_header_from_columns) 452 f.close() 453 454 # Replace #CHROM line with rel columns 455 header_list = db_for_header.read_header_file( 456 header_file=header_file_tmp 457 ) 458 header_list[-1] = "\t".join(db_header_columns) 459 460 except: 461 462 log.warning( 463 f"No header for file {input_file}. Set as default VCF header" 464 ) 465 header_list = default_header_list 466 467 else: # try for unknown format ? 468 469 log.error(f"Input file format '{input_format}' not available") 470 raise ValueError(f"Input file format '{input_format}' not available") 471 472 if not header_list: 473 header_list = default_header_list 474 475 # header as list 476 self.header_list = header_list 477 478 # header as VCF object 479 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 480 481 else: 482 483 self.header_list = None 484 self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
486 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 487 """ 488 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 489 DataFrame based on the connection format. 490 491 :param query: The `query` parameter in the `get_query_to_df` function is a string that 492 represents the SQL query you want to execute. This query will be used to fetch data from a 493 database and convert it into a pandas DataFrame 494 :type query: str 495 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 496 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 497 function will only fetch up to that number of rows from the database query result. If no limit 498 is specified, 499 :type limit: int 500 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 501 """ 502 503 # Connexion format 504 connexion_format = self.get_connexion_format() 505 506 # Limit in query 507 if limit: 508 pd.set_option("display.max_rows", limit) 509 if connexion_format in ["duckdb"]: 510 df = ( 511 self.conn.execute(query) 512 .fetch_record_batch(limit) 513 .read_next_batch() 514 .to_pandas() 515 ) 516 elif connexion_format in ["sqlite"]: 517 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 518 519 # Full query 520 else: 521 if connexion_format in ["duckdb"]: 522 df = self.conn.execute(query).df() 523 elif connexion_format in ["sqlite"]: 524 df = pd.read_sql_query(query, self.conn) 525 526 return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: The `query` parameter in the `get_query_to_df` function is a string containing the SQL query to execute. This query is used to fetch data from the database and convert it into a pandas DataFrame.
- limit: The `limit` parameter in the `get_query_to_df` function specifies the maximum number of rows returned in the resulting DataFrame. If a limit is provided, the function fetches at most that number of rows; if no limit is specified, the full query result is returned.
Returns
A pandas DataFrame is being returned by the
get_query_to_dffunction.
528 def get_overview(self) -> None: 529 """ 530 The function prints the input, output, config, and dataframe of the current object 531 """ 532 table_variants_from = self.get_table_variants(clause="from") 533 sql_columns = self.get_header_columns_as_sql() 534 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 535 df = self.get_query_to_df(sql_query_export) 536 log.info( 537 "Input: " 538 + str(self.get_input()) 539 + " [" 540 + str(str(self.get_input_format())) 541 + "]" 542 ) 543 log.info( 544 "Output: " 545 + str(self.get_output()) 546 + " [" 547 + str(str(self.get_output_format())) 548 + "]" 549 ) 550 log.info("Config: ") 551 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 552 "\n" 553 ): 554 log.info("\t" + str(d)) 555 log.info("Param: ") 556 for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 557 "\n" 558 ): 559 log.info("\t" + str(d)) 560 log.info("Sample list: " + str(self.get_header_sample_list())) 561 log.info("Dataframe: ") 562 for d in str(df).split("\n"): 563 log.info("\t" + str(d)) 564 565 # garbage collector 566 del df 567 gc.collect() 568 569 return None
The function prints the input, output, config, and dataframe of the current object
571 def get_stats(self) -> dict: 572 """ 573 The `get_stats` function calculates and returns various statistics of the current object, 574 including information about the input file, variants, samples, header fields, quality, and 575 SNVs/InDels. 576 :return: a dictionary containing various statistics of the current object. The dictionary has 577 the following structure: 578 """ 579 580 # Log 581 log.info(f"Stats Calculation...") 582 583 # table varaints 584 table_variants_from = self.get_table_variants() 585 586 # stats dict 587 stats = {"Infos": {}} 588 589 ### File 590 input_file = self.get_input() 591 stats["Infos"]["Input file"] = input_file 592 593 # Header 594 header_infos = self.get_header().infos 595 header_formats = self.get_header().formats 596 header_infos_list = list(header_infos) 597 header_formats_list = list(header_formats) 598 599 ### Variants 600 601 stats["Variants"] = {} 602 603 # Variants by chr 604 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 605 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 606 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 607 by=["CHROM"], kind="quicksort" 608 ) 609 610 # Total number of variants 611 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 612 613 # Calculate percentage 614 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 615 lambda x: (x / nb_of_variants) 616 ) 617 618 stats["Variants"]["Number of variants by chromosome"] = ( 619 nb_of_variants_by_chrom.to_dict(orient="index") 620 ) 621 622 stats["Infos"]["Number of variants"] = int(nb_of_variants) 623 624 ### Samples 625 626 # Init 627 samples = {} 628 nb_of_samples = 0 629 630 # Check Samples 631 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 632 log.debug(f"Check samples...") 633 for sample in self.get_header_sample_list(): 634 sql_query_samples = f""" 635 SELECT 
'{sample}' as sample, 636 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 637 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 638 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 639 FROM {table_variants_from} 640 WHERE ( 641 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 642 AND 643 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 644 ) 645 GROUP BY genotype 646 """ 647 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 648 sample_genotype_count = sql_query_genotype_df["count"].sum() 649 if len(sql_query_genotype_df): 650 nb_of_samples += 1 651 samples[f"{sample} - {sample_genotype_count} variants"] = ( 652 sql_query_genotype_df.to_dict(orient="index") 653 ) 654 655 stats["Samples"] = samples 656 stats["Infos"]["Number of samples"] = nb_of_samples 657 658 # # 659 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 660 # stats["Infos"]["Number of samples"] = nb_of_samples 661 # elif nb_of_samples: 662 # stats["Infos"]["Number of samples"] = "not a VCF format" 663 664 ### INFO and FORMAT fields 665 header_types_df = {} 666 header_types_list = { 667 "List of INFO fields": header_infos, 668 "List of FORMAT fields": header_formats, 669 } 670 i = 0 671 for header_type in header_types_list: 672 673 header_type_infos = header_types_list.get(header_type) 674 header_infos_dict = {} 675 676 for info in header_type_infos: 677 678 i += 1 679 header_infos_dict[i] = {} 680 681 # ID 682 header_infos_dict[i]["id"] = info 683 684 # num 685 genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"} 686 if header_type_infos[info].num in genotype_map.keys(): 687 header_infos_dict[i]["Number"] = genotype_map.get( 688 header_type_infos[info].num 689 ) 690 else: 691 header_infos_dict[i]["Number"] = header_type_infos[info].num 692 693 # type 694 if header_type_infos[info].type: 695 header_infos_dict[i]["Type"] = 
header_type_infos[info].type 696 else: 697 header_infos_dict[i]["Type"] = "." 698 699 # desc 700 if header_type_infos[info].desc != None: 701 header_infos_dict[i]["Description"] = header_type_infos[info].desc 702 else: 703 header_infos_dict[i]["Description"] = "" 704 705 if len(header_infos_dict): 706 header_types_df[header_type] = pd.DataFrame.from_dict( 707 header_infos_dict, orient="index" 708 ).to_dict(orient="index") 709 710 # Stats 711 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 712 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 713 stats["Header"] = header_types_df 714 715 ### QUAL 716 if "QUAL" in self.get_header_columns(): 717 sql_query_qual = f""" 718 SELECT 719 avg(CAST(QUAL AS INTEGER)) AS Average, 720 min(CAST(QUAL AS INTEGER)) AS Minimum, 721 max(CAST(QUAL AS INTEGER)) AS Maximum, 722 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 723 median(CAST(QUAL AS INTEGER)) AS Median, 724 variance(CAST(QUAL AS INTEGER)) AS Variance 725 FROM {table_variants_from} 726 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 727 """ 728 729 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 730 stats["Quality"] = {"Stats": qual} 731 732 ### SNV and InDel 733 734 sql_query_snv = f""" 735 736 SELECT Type, count FROM ( 737 738 SELECT 739 'Total' AS Type, 740 count(*) AS count 741 FROM {table_variants_from} 742 743 UNION 744 745 SELECT 746 'MNV' AS Type, 747 count(*) AS count 748 FROM {table_variants_from} 749 WHERE len(REF) > 1 AND len(ALT) > 1 750 AND len(REF) = len(ALT) 751 752 UNION 753 754 SELECT 755 'InDel' AS Type, 756 count(*) AS count 757 FROM {table_variants_from} 758 WHERE len(REF) > 1 OR len(ALT) > 1 759 AND len(REF) != len(ALT) 760 761 UNION 762 763 SELECT 764 'SNV' AS Type, 765 count(*) AS count 766 FROM {table_variants_from} 767 WHERE len(REF) = 1 AND len(ALT) = 1 768 769 ) 770 771 ORDER BY count DESC 772 773 """ 774 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 775 776 
sql_query_snv_substitution = f""" 777 SELECT 778 concat(REF, '>', ALT) AS 'Substitution', 779 count(*) AS count 780 FROM {table_variants_from} 781 WHERE len(REF) = 1 AND len(ALT) = 1 782 GROUP BY REF, ALT 783 ORDER BY count(*) DESC 784 """ 785 snv_substitution = ( 786 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 787 ) 788 stats["Variants"]["Counts"] = snv_indel 789 stats["Variants"]["Substitutions"] = snv_substitution 790 791 return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
793 def stats_to_file(self, file: str = None) -> str: 794 """ 795 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 796 into a JSON object, and writes the JSON object to the specified file. 797 798 :param file: The `file` parameter is a string that represents the file path where the JSON data 799 will be written 800 :type file: str 801 :return: the name of the file that was written to. 802 """ 803 804 # Get stats 805 stats = self.get_stats() 806 807 # Serializing json 808 json_object = json.dumps(stats, indent=4) 809 810 # Writing to sample.json 811 with open(file, "w") as outfile: 812 outfile.write(json_object) 813 814 return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: The
fileparameter is a string that represents the file path where the JSON data will be written
Returns
the name of the file that was written to.
    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
        """
        The `print_stats` function generates a markdown file and prints the statistics contained in a
        JSON file in a formatted manner.

        :param output_file: path of the Markdown output file; when None, a
            temporary "stats.md" is used
        :param json_file: path of the JSON stats file; when None, a temporary
            "stats.json" is used
        :return: None
        """

        # Full path
        output_file = full_path(output_file)
        json_file = full_path(json_file)

        with tempfile.TemporaryDirectory() as tmpdir:

            # Files (defaults inside the temporary directory)
            if not output_file:
                output_file = os.path.join(tmpdir, "stats.md")
            if not json_file:
                json_file = os.path.join(tmpdir, "stats.json")

            # Create folders
            if not os.path.exists(os.path.dirname(output_file)):
                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
            if not os.path.exists(os.path.dirname(json_file)):
                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

            # Create stats JSON file
            stats_file = self.stats_to_file(file=json_file)

            # Print stats file
            # (JSON is a subset of YAML, so safe_load parses the JSON file)
            with open(stats_file) as f:
                stats = yaml.safe_load(f)

            # Output
            output_title = []
            output_index = []
            output = []

            # Title
            output_title.append("# HOWARD Stats")

            # Index
            output_index.append("## Index")

            # Process sections
            for section in stats:
                infos = stats.get(section)
                section_link = "#" + section.lower().replace(" ", "-")
                output.append(f"## {section}")
                output_index.append(f"- [{section}]({section_link})")

                if len(infos):
                    for info in infos:
                        # Try to render the value as a markdown table, either
                        # directly or after decoding a JSON string; fall back
                        # to a plain bullet when neither works
                        try:
                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                            is_df = True
                        except:
                            try:
                                df = pd.DataFrame.from_dict(
                                    json.loads((infos.get(info))), orient="index"
                                )
                                is_df = True
                            except:
                                is_df = False
                        if is_df:
                            output.append(f"### {info}")
                            info_link = "#" + info.lower().replace(" ", "-")
                            output_index.append(f" - [{info}]({info_link})")
                            output.append(f"{df.to_markdown(index=False)}")
                        else:
                            output.append(f"- {info}: {infos.get(info)}")
                else:
                    output.append(f"NA")

            # Write stats in markdown file (title, then index, then body)
            with open(output_file, "w") as fp:
                for item in output_title:
                    fp.write("%s\n" % item)
                for item in output_index:
                    fp.write("%s\n" % item)
                for item in output:
                    fp.write("%s\n" % item)

            # Output stats in markdown (index intentionally not printed)
            print("")
            print("\n\n".join(output_title))
            print("")
            print("\n\n".join(output))
            print("")

        return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The `output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory is created and the stats are saved in a file named "stats.md" within it.
- json_file: The `json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory is created and a default file name "stats.json" is used.
Returns
The function
print_statsdoes not return any value. It has a return type annotation ofNone.
918 def get_input(self) -> str: 919 """ 920 It returns the value of the input variable. 921 :return: The input is being returned. 922 """ 923 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
925 def get_input_format(self, input_file: str = None) -> str: 926 """ 927 This function returns the format of the input variable, either from the provided input file or 928 by prompting for input. 929 930 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 931 represents the file path of the input file. If no `input_file` is provided when calling the 932 method, it will default to `None` 933 :type input_file: str 934 :return: The format of the input variable is being returned. 935 """ 936 937 if not input_file: 938 input_file = self.get_input() 939 input_format = get_file_format(input_file) 940 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The `input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it defaults to the object's own input file.
Returns
The format of the input variable is being returned.
942 def get_input_compressed(self, input_file: str = None) -> str: 943 """ 944 The function `get_input_compressed` returns the format of the input variable after compressing 945 it. 946 947 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 948 that represents the file path of the input file. If no `input_file` is provided when calling the 949 method, it will default to `None` and the method will then call `self.get_input()` to 950 :type input_file: str 951 :return: The function `get_input_compressed` returns the compressed format of the input 952 variable. 953 """ 954 955 if not input_file: 956 input_file = self.get_input() 957 input_compressed = get_file_compressed(input_file) 958 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The `input_file` parameter in the `get_input_compressed` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it defaults to `None` and the method then calls `self.get_input()` to obtain it.
Returns
The function
get_input_compressedreturns the compressed format of the input variable.
960 def get_output(self) -> str: 961 """ 962 It returns the output of the neuron. 963 :return: The output of the neural network. 964 """ 965 966 return self.output
It returns the output file.
Returns
The output file path.
968 def get_output_format(self, output_file: str = None) -> str: 969 """ 970 The function `get_output_format` returns the format of the input variable or the output file if 971 provided. 972 973 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 974 that represents the file path of the output file. If no `output_file` is provided when calling 975 the method, it will default to the output obtained from the `get_output` method of the class 976 instance. The 977 :type output_file: str 978 :return: The format of the input variable is being returned. 979 """ 980 981 if not output_file: 982 output_file = self.get_output() 983 output_format = get_file_format(output_file) 984 985 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The
output_fileparameter in theget_output_formatmethod is a string that represents the file path of the output file. If nooutput_fileis provided when calling the method, it will default to the output obtained from theget_outputmethod of the class instance. The
Returns
The format of the input variable is being returned.
987 def get_config(self) -> dict: 988 """ 989 It returns the config 990 :return: The config variable is being returned. 991 """ 992 return self.config
It returns the config
Returns
The config variable is being returned.
994 def get_param(self) -> dict: 995 """ 996 It returns the param 997 :return: The param variable is being returned. 998 """ 999 return self.param
It returns the param
Returns
The param variable is being returned.
1001 def get_connexion_db(self) -> str: 1002 """ 1003 It returns the connexion_db attribute of the object 1004 :return: The connexion_db is being returned. 1005 """ 1006 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
1008 def get_prefix(self) -> str: 1009 """ 1010 It returns the prefix of the object. 1011 :return: The prefix is being returned. 1012 """ 1013 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
1015 def get_table_variants(self, clause: str = "select") -> str: 1016 """ 1017 This function returns the table_variants attribute of the object 1018 1019 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1020 defaults to select (optional) 1021 :return: The table_variants attribute of the object. 1022 """ 1023 1024 # Access 1025 access = self.get_config().get("access", None) 1026 1027 # Clauses "select", "where", "update" 1028 if clause in ["select", "where", "update"]: 1029 table_variants = self.table_variants 1030 # Clause "from" 1031 elif clause in ["from"]: 1032 # For Read Only 1033 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1034 input_file = self.get_input() 1035 table_variants = f"'{input_file}' as variants" 1036 # For Read Write 1037 else: 1038 table_variants = f"{self.table_variants} as variants" 1039 else: 1040 table_variants = self.table_variants 1041 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
1043 def get_tmp_dir(self) -> str: 1044 """ 1045 The function `get_tmp_dir` returns the temporary directory path based on configuration 1046 parameters or a default path. 1047 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1048 configuration, parameters, and a default value of "/tmp". 1049 """ 1050 1051 return get_tmp( 1052 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1053 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The
get_tmp_dirmethod is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1055 def get_connexion_type(self) -> str: 1056 """ 1057 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1058 1059 :return: The connexion type is being returned. 1060 """ 1061 return self.get_config().get("connexion_type", "memory")
Returns the configured connexion type from the config, defaulting to "memory".
Returns
The connexion type string.
1063 def get_connexion(self): 1064 """ 1065 It returns the connection object 1066 1067 :return: The connection object. 1068 """ 1069 return self.conn
It returns the connection object
Returns
The connection object.
1071 def close_connexion(self) -> None: 1072 """ 1073 This function closes the connection to the database. 1074 :return: The connection is being closed. 1075 """ 1076 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1078 def get_header(self, type: str = "vcf"): 1079 """ 1080 This function returns the header of the VCF file as a list of strings 1081 1082 :param type: the type of header you want to get, defaults to vcf (optional) 1083 :return: The header of the vcf file. 1084 """ 1085 1086 if self.header_vcf: 1087 if type == "vcf": 1088 return self.header_vcf 1089 elif type == "list": 1090 return self.header_list 1091 else: 1092 if type == "vcf": 1093 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1094 return header 1095 elif type == "list": 1096 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1098 def get_header_infos_list(self) -> list: 1099 """ 1100 This function retrieves a list of information fields from the header. 1101 :return: A list of information fields from the header. 1102 """ 1103 1104 # Init 1105 infos_list = [] 1106 1107 for field in self.get_header().infos: 1108 infos_list.append(field) 1109 1110 return infos_list
This function retrieves a list of information fields from the header.
Returns
A list of information fields from the header.
1112 def get_header_length(self, file: str = None) -> int: 1113 """ 1114 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1115 line. 1116 1117 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1118 header file. If this argument is provided, the function will read the header from the specified 1119 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1120 :type file: str 1121 :return: the length of the header list, excluding the #CHROM line. 1122 """ 1123 1124 if file: 1125 return len(self.read_vcf_header_file(file=file)) - 1 1126 elif self.get_header(type="list"): 1127 return len(self.get_header(type="list")) - 1 1128 else: 1129 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The
fileparameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns
the length of the header list, excluding the #CHROM line.
1131 def get_header_columns(self) -> str: 1132 """ 1133 This function returns the header list of a VCF 1134 1135 :return: The length of the header list. 1136 """ 1137 if self.get_header(): 1138 return self.get_header(type="list")[-1] 1139 else: 1140 return ""
This function returns the columns line (#CHROM line) of the VCF header
Returns
The last line of the header list (the #CHROM columns line), or an empty string when no header is available.
1142 def get_header_columns_as_list(self) -> list: 1143 """ 1144 This function returns the header list of a VCF 1145 1146 :return: The length of the header list. 1147 """ 1148 if self.get_header(): 1149 return self.get_header_columns().strip().split("\t") 1150 else: 1151 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1153 def get_header_columns_as_sql(self) -> str: 1154 """ 1155 This function retruns header length (without #CHROM line) 1156 1157 :return: The length of the header list. 1158 """ 1159 sql_column_list = [] 1160 for col in self.get_header_columns_as_list(): 1161 sql_column_list.append(f'"{col}"') 1162 return ",".join(sql_column_list)
This function returns the header columns as quoted SQL identifiers
Returns
A comma-separated string of double-quoted column names.
1164 def get_header_sample_list( 1165 self, check: bool = False, samples: list = None, samples_force: bool = False 1166 ) -> list: 1167 """ 1168 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1169 checking and filtering based on input parameters. 1170 1171 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1172 parameter that determines whether to check if the samples in the list are properly defined as 1173 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1174 list is defined as a, defaults to False 1175 :type check: bool (optional) 1176 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1177 allows you to specify a subset of samples from the header. If you provide a list of sample 1178 names, the function will check if each sample is defined in the header. If a sample is not found 1179 in the 1180 :type samples: list 1181 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1182 a boolean parameter that determines whether to force the function to return the sample list 1183 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1184 function will return the sample list without performing, defaults to False 1185 :type samples_force: bool (optional) 1186 :return: The function `get_header_sample_list` returns a list of samples based on the input 1187 parameters and conditions specified in the function. 
1188 """ 1189 1190 # Init 1191 samples_list = [] 1192 1193 if samples is None: 1194 samples_list = self.header_vcf.samples 1195 else: 1196 samples_checked = [] 1197 for sample in samples: 1198 if sample in self.header_vcf.samples: 1199 samples_checked.append(sample) 1200 else: 1201 log.warning(f"Sample '{sample}' not defined in header") 1202 samples_list = samples_checked 1203 1204 # Force sample list without checking if is_genotype_column 1205 if samples_force: 1206 log.warning(f"Samples {samples_list} not checked if genotypes") 1207 return samples_list 1208 1209 if check: 1210 samples_checked = [] 1211 for sample in samples_list: 1212 if self.is_genotype_column(column=sample): 1213 samples_checked.append(sample) 1214 else: 1215 log.warning( 1216 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1217 ) 1218 samples_list = samples_checked 1219 1220 # Return samples list 1221 return samples_list
The function get_header_sample_list returns a list of samples from a VCF header, with optional
checking and filtering based on input parameters.
Parameters
- check: The `check` parameter is a boolean that determines whether to verify that each sample in the list is properly defined as a genotype column. Defaults to False.
- samples: The `samples` parameter is a list specifying a subset of samples from the header. Each provided sample name is checked against the header; names not found in the header are dropped with a warning.
- samples_force: The `samples_force` parameter is a boolean that, when True, makes the function return the sample list without checking whether the samples are genotype columns. Defaults to False.
Returns
The function
get_header_sample_listreturns a list of samples based on the input parameters and conditions specified in the function.
1223 def is_genotype_column(self, column: str = None) -> bool: 1224 """ 1225 This function checks if a given column is a genotype column in a database. 1226 1227 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1228 represents the column name in a database table. This method checks if the specified column is a 1229 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1230 method of 1231 :type column: str 1232 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1233 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1234 column name and returns the result. If the `column` parameter is None, it returns False. 1235 """ 1236 1237 if column is not None: 1238 return Database(database=self.get_input()).is_genotype_column(column=column) 1239 else: 1240 return False
This function checks if a given column is a genotype column in a database.
Parameters
- column: The `column` parameter is a string naming the column to check in the database table. When a column name is provided, the method delegates to the `is_genotype_column` method of the `Database` class.
Returns
The
is_genotype_columnmethod is returning a boolean value. If thecolumnparameter is not None, it calls theis_genotype_columnmethod of theDatabaseclass with the specified column name and returns the result. If thecolumnparameter is None, it returns False.
1242 def get_verbose(self) -> bool: 1243 """ 1244 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1245 exist 1246 1247 :return: The value of the key "verbose" in the config dictionary. 1248 """ 1249 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1251 def get_connexion_format(self) -> str: 1252 """ 1253 It returns the connexion format of the object. 1254 :return: The connexion_format is being returned. 1255 """ 1256 connexion_format = self.connexion_format 1257 if connexion_format not in ["duckdb", "sqlite"]: 1258 log.error(f"Unknown connexion format {connexion_format}") 1259 raise ValueError(f"Unknown connexion format {connexion_format}") 1260 else: 1261 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        `variants` table of the current database connexion (duckdb or sqlite).

        :param file: The file to load: a path or file-like object accepted by
            `pandas.read_csv`
        :param columns: Comma-separated, quoted column names used to build the
            duckdb INSERT statement
        :type columns: str
        :param header_len: Number of lines to skip at the beginning of the
            file (e.g. VCF header lines), defaults to 0
        :type header_len: int (optional)
        :param sep: Field separator of the file, defaults to "\t"
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; overridden by the
            "load.chunk" configuration entry when present, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config: the configured chunk size takes precedence over the argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize is falsy (None/0), nothing is inserted
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # duckdb resolves "chunk" via its replacement scan of local
                    # Pandas DataFrames, so the DataFrame is queried by name
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite path: append the DataFrame through pandas.to_sql
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The
fileparameter is the file that you want to load into a table. It should be the path to the file on your system - columns: The
columnsparameter in theinsert_file_to_tablefunction is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name - header_len: The
header_lenparameter in theinsert_file_to_tablefunction specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0 - sep: The
sepparameter in theinsert_file_to_tablefunction is used to specify the separator character that is used in the file being read. In this case, the default separator is set to, which represents a tab character. You can change this parameter to a different separator character if, defaults to - chunksize: The
chunksizeparameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value forchunksizeis set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Load the input file (VCF/TSV/CSV/PSV, or an existing database file)
        into the variants table, optionally dropping the table first.

        :param input_file: Path of the input file to load; when provided, the
            object's input and header are updated before loading
        :type input_file: str
        :param drop_variants_table: When True, drop the variants table before
            loading the data, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled from the input (passed to
            the database layer, presumably for schema detection — TODO
            confirm); falsy values are mapped to -1, defaults to 20480
        :type sample_size: int (optional)
        :raises ValueError: When the input format is not loadable with the
            current connexion format.
        """

        log.info("Loading...")

        # change input file (re-reads the header from the new input)
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access mode: "RO" selects VIEW creation instead of TABLE below
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        # NOTE(review): computed and logged only; never used afterwards
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size: falsy means "no limit" for the database layer
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                # NOTE(review): connexion_format is already known to be
                # "duckdb" here, so the else branch below is unreachable
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    # NOTE(review): "compatilbe" typo kept — runtime string
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View from the SQL source of the input
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access creates a VIEW instead of a TABLE
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # NOTE(review): bare except masks the real failure (and
                    # catches KeyboardInterrupt/SystemExit) — consider
                    # `except Exception as e` and chaining with `from e`
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: mandatory VCF columns and their SQL types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): structure_complete aliases structure (no copy);
            # both names refer to the same dict, which happens to be harmless
            # here but a dict(structure) copy was probably intended
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the length of each file chunk loaded
            chunksize = 100000

            # delimiter resolved from the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): the bgzf handle shadows the plain handle
                # without closing it, and the bgzf handle itself is never
                # explicitly closed — confirm whether this leak matters here
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            # Unsupported connexion/input format combination
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The
drop_variants_tableparameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set toTrue, the variants table will be dropped. If set toFalse(default), the variants table will not be dropped, defaults to False - sample_size: The
sample_sizeparameter determines the number of rows to be sampled from the input file. If it is set toNone, the default value of 20480 will be used, defaults to 20480
1513 def get_explode_infos(self) -> bool: 1514 """ 1515 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1516 to False if it is not set. 1517 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1518 value. If the parameter is not present, it will return False. 1519 """ 1520 1521 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        Return the list of INFO fields to explode, resolved from the input
        parameter, from the "explode.explode_infos_fields" parameter, or from
        the "*" keyword (all header INFO fields).

        :param explode_infos_fields: Fields to explode: a comma-separated
            string or a list of names; entries are treated as regex patterns
            matched against the header INFO fields, and "*" matches all of
            them
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: When True, fields absent from the
            header are removed from the result; when False, such fields are
            kept as-is, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The resolved list of INFO field names (without duplicates).
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If still no fields, default to all header fields via the "*" keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Normalize input to a list of field names/patterns
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            # NOTE(review): fields_without_all is computed but never used
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header (deduplicated, sorted)
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # Translate the "*" keyword to a match-all regex
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all header fields matching the pattern (re.match)
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # An exact match takes priority over pattern expansion;
                # otherwise drop matches that were themselves given as input
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header, keep it anyway unless asked to
                # remove unknown fields (avoids losing fields on a badly
                # formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already present, if allowed by the
                    # header filter, and never the raw ".*" pattern itself
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The
explode_infos_fieldsparameter is a string that specifies the fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a comma-separated list of field names to explode - remove_fields_not_in_header: The parameter
remove_fields_not_in_headeris a boolean flag that determines whether to remove fields that are not present in the header. If it is set toTrue, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to `, defaults to False
Returns
The function
get_explode_infos_fieldsreturns a list of exploded information fields. If theexplode_infos_fieldsparameter is not provided or is set to None, it returns an empty list. If the parameter is provided and its value is "ALL", it also returns an empty list. Otherwise, it returns a list of exploded information fields after removing any spaces and splitting the string by commas.
1623 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1624 """ 1625 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1626 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1627 not provided. 1628 1629 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1630 prefix to be used for exploding or expanding information 1631 :type explode_infos_prefix: str 1632 :return: the value of the variable `explode_infos_prefix`. 1633 """ 1634 1635 if not explode_infos_prefix: 1636 explode_infos_prefix = ( 1637 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1638 ) 1639 1640 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or,
when it is not provided, the value of self.get_param().get("explode", {}).get("explode_infos_prefix", "").
Parameters
- explode_infos_prefix: The parameter
explode_infos_prefixis a string that specifies a prefix to be used for exploding or expanding information
Returns
the value of the variable
explode_infos_prefix.
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a table (with an optional default value) if it does
        not already exist, optionally dropping and recreating it.

        :param table_name: Name of the table to add the column to
        :param column_name: Name of the column to add (matched
            case-insensitively against existing columns)
        :param column_type: SQL data type of the new column (e.g. "INTEGER",
            "VARCHAR")
        :param default_value: Optional DEFAULT value for the new column,
            injected verbatim into the ALTER TABLE statement
        :param drop: When True and the column already exists, drop it and
            recreate it; when False, an existing column is left untouched,
            defaults to False
        :type drop: bool (optional)
        :return: A dict describing the added column ("table_name",
            "column_name", "column_type", "default_value"), or None when the
            column already existed (including when it was dropped and
            recreated via `drop=True`).
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table (case-insensitive)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        # NOTE(review): identifiers and default_value are interpolated into
        # SQL without escaping — only safe for trusted, internal names
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A drop-and-recreate is deliberately not reported as "added"
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The
column_typeparameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc - default_value: The
default_valueparameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column - drop: The
dropparameter is a boolean flag that determines whether to drop the column if it already exists in the table. Ifdropis set toTrue, the function will drop the existing column before adding the new column. Ifdropis set toFalse(default),, defaults to False
Returns
a boolean value indicating whether the column was successfully added to the table.
1714 def drop_column( 1715 self, column: dict = None, table_name: str = None, column_name: str = None 1716 ) -> bool: 1717 """ 1718 The `drop_column` function drops a specified column from a given table in a database and returns 1719 True if the column was successfully dropped, and False if the column does not exist in the 1720 table. 1721 1722 :param column: The `column` parameter is a dictionary that contains information about the column 1723 you want to drop. It has two keys: 1724 :type column: dict 1725 :param table_name: The `table_name` parameter is the name of the table from which you want to 1726 drop a column 1727 :type table_name: str 1728 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1729 from the table 1730 :type column_name: str 1731 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1732 and False if the column does not exist in the table. 1733 """ 1734 1735 # Find column infos 1736 if column: 1737 if isinstance(column, dict): 1738 table_name = column.get("table_name", None) 1739 column_name = column.get("column_name", None) 1740 elif isinstance(column, str): 1741 table_name = self.get_table_variants() 1742 column_name = column 1743 else: 1744 table_name = None 1745 column_name = None 1746 1747 if not table_name and not column_name: 1748 return False 1749 1750 # Removed 1751 removed = False 1752 1753 # Check if the column already exists in the table 1754 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1755 columns = self.get_query_to_df(query).columns.tolist() 1756 if column_name in columns: 1757 log.debug(f"The {column_name} column exists in the {table_name} table") 1758 else: 1759 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1760 return False 1761 1762 # Add column in table # ALTER TABLE integers DROP k 1763 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1764 
self.execute_query(add_column_query) 1765 removed = True 1766 log.debug( 1767 f"The {column_name} column was successfully dropped to the {table_name} table" 1768 ) 1769 1770 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The
columnparameter is a dictionary that contains information about the column you want to drop. It has two keys: - table_name: The
table_nameparameter is the name of the table from which you want to drop a column - column_name: The
column_nameparameter is the name of the column that you want to drop from the table
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        Explode VCF INFO fields into individual table columns and populate
        them from the INFO column, returning the list of added columns.

        :param prefix: Prefix for the exploded columns; when None/True/non-str
            it falls back to `get_explode_infos_prefix()` and then to "INFO/"
        :type prefix: str
        :param create_index: When True, recreate indexes after exploding,
            defaults to False
        :type create_index: bool (optional)
        :param fields: INFO fields (or patterns, see
            `get_explode_infos_fields`) to explode; empty means all header
            fields
        :type fields: list
        :param force: When True, existing columns are dropped and recreated
            (and re-populated), defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: When True, all fields are updated
            in a single UPDATE statement per chromosome; otherwise one UPDATE
            per field, defaults to False.  (NOTE(review): parameter name typo
            "proccess" kept — renaming would break callers.)
        :type proccess_all_fields_together: bool (optional)
        :param table: Target table for the exploded columns; defaults to the
            variants table
        :type table: str
        :return: The list of added columns (dicts as returned by
            `add_column`).
        """

        # drop indexes (they would be invalidated by the schema change)
        self.drop_indexes()

        # connexion format
        connexion_format = self.get_connexion_format()

        # Access: read-only connexions cannot be modified, skip entirely
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: explicit > parameter > "INFO/" fallback
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos
            # NOTE(review): bare except silently falls back to [] on any error
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Known fields: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (also expands the "*" keyword)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                info_id_sql = prefix + info

                # Only process fields known from the header, the request, or
                # the extra infos
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/cardinality from the header; unknown fields are
                    # treated as single strings
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the per-backend UPDATE expression extracting
                        # "<info>=<value>" from the INFO column
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                            END
                            """
                        elif connexion_format in ["sqlite"]:
                            # sqlite has no regex; emulate the extraction with
                            # nested instr/substr calls
                            update_info_field = f"""
                            "{info_id_sql}" =
                            CASE
                                WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                            END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to bound UPDATE size
                # NOTE(review): bare except falls back to a single full-table
                # pass on any query error
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only needed with more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # Single UPDATE setting all exploded fields at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded field
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes
        if create_index:
            self.create_indexes()

        return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO fields. If the `prefix` is not provided or is set to `None`, the function will use the value of `self.get_explode_infos_prefix()` as the prefix - create_index: The
create_indexparameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set toTrue, indexes will be created; if set toFalse, indexes will not be created. The default value isFalse, defaults to False - fields: The
fieldsparameter in theexplode_infosfunction is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the ` - force: The
forceparameter in theexplode_infosfunction is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. Ifforceis set toTrue, the column will be dropped and recreated. Ifforceis set to `False, defaults to False - proccess_all_fields_together: The
proccess_all_fields_togetherparameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set toTrue, all the INFO fields will be processed together. If set toFalse, each INFO field will be processed individually. The default value is, defaults to False - table: The
tableparameter in theexplode_infosfunction is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for thetableparameter, the function will use that table name. If thetableparameter is
Returns
The
explode_infosfunction returns a list of added columns.
1989 def create_indexes(self) -> None: 1990 """ 1991 Create indexes on the table after insertion 1992 """ 1993 1994 # Access 1995 access = self.get_config().get("access", None) 1996 1997 # get table variants 1998 table_variants = self.get_table_variants("FROM") 1999 2000 if self.get_indexing() and access not in ["RO"]: 2001 # Create index 2002 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2003 self.conn.execute(sql_create_table_index) 2004 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2005 self.conn.execute(sql_create_table_index) 2006 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2007 self.conn.execute(sql_create_table_index) 2008 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2009 self.conn.execute(sql_create_table_index) 2010 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2011 self.conn.execute(sql_create_table_index) 2012 for field in self.index_additionnal_fields: 2013 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2014 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
2016 def drop_indexes(self) -> None: 2017 """ 2018 Create indexes on the table after insertion 2019 """ 2020 2021 # Access 2022 access = self.get_config().get("access", None) 2023 2024 # get table variants 2025 table_variants = self.get_table_variants("FROM") 2026 2027 # Get database format 2028 connexion_format = self.get_connexion_format() 2029 2030 if access not in ["RO"]: 2031 if connexion_format in ["duckdb"]: 2032 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2033 elif connexion_format in ["sqlite"]: 2034 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2035 2036 list_indexes = self.conn.execute(sql_list_indexes) 2037 index_names = [row[0] for row in list_indexes.fetchall()] 2038 for index in index_names: 2039 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2040 self.conn.execute(sql_drop_table_index)
Drop the existing indexes on the table
2042 def read_vcf_header(self, f) -> list: 2043 """ 2044 It reads the header of a VCF file and returns a list of the header lines 2045 2046 :param f: the file object 2047 :return: The header lines of the VCF file. 2048 """ 2049 2050 header_list = [] 2051 for line in f: 2052 header_list.append(line) 2053 if line.startswith("#CHROM"): 2054 break 2055 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
2057 def read_vcf_header_file(self, file: str = None) -> list: 2058 """ 2059 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2060 uncompressed files. 2061 2062 :param file: The `file` parameter is a string that represents the path to the VCF header file 2063 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2064 default to `None` 2065 :type file: str 2066 :return: The function `read_vcf_header_file` returns a list. 2067 """ 2068 2069 if self.get_input_compressed(input_file=file): 2070 with bgzf.open(file, "rt") as f: 2071 return self.read_vcf_header(f=f) 2072 else: 2073 with open(file, "rt") as f: 2074 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The
fileparameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default toNone
Returns
The function
read_vcf_header_filereturns a list.
2076 def execute_query(self, query: str): 2077 """ 2078 It takes a query as an argument, executes it, and returns the results 2079 2080 :param query: The query to be executed 2081 :return: The result of the query is being returned. 2082 """ 2083 if query: 2084 return self.conn.execute(query) # .fetchall() 2085 else: 2086 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
def export_output(
    self,
    output_file: str | None = None,
    output_header: str | None = None,
    export_header: bool = True,
    query: str | None = None,
    parquet_partitions: list | None = None,
    chunk_size: int | None = None,
    threads: int | None = None,
    sort: bool = False,
    index: bool = False,
    order_by: str | None = None,
    fields_to_rename: dict | None = None,
) -> bool:
    """
    Export the variants data to an output file (VCF, CSV, TSV, PSV or Parquet).

    :param output_file: Path of the exported file; defaults to self.get_output()
    :param output_header: Path of the exported header file; defaults to
        "<output_file>.hdr"
    :param export_header: Whether to export the header to a separate file,
        defaults to True (switched off automatically for VCF outputs, where
        the header is embedded)
    :param query: Optional SQL query used to filter/select the exported data
    :param parquet_partitions: Columns used to partition a Parquet export
        (list, or comma-separated string)
    :param chunk_size: Number of records per batch for Parquet export;
        defaults to the "chunk_size" config entry
    :param threads: Number of threads used for the export; defaults to
        self.get_threads()
    :param sort: Whether to sort the output by genomic coordinates,
        defaults to False
    :param index: Whether to index the output file, defaults to False
    :param order_by: Column(s) used to order the output (VCF export);
        defaults to the "export.order_by" param entry
    :param fields_to_rename: Mapping of field names to rename before export;
        defaults to the "export.fields_to_rename" param entry
    :return: True if the output file exists after export, None otherwise
    """

    # Log
    log.info("Exporting...")

    # Full paths
    output_file = full_path(output_file)
    output_header = full_path(output_header)

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Tmp files to remove after export
    tmp_to_remove = []

    # If no output, get it
    if not output_file:
        output_file = self.get_output()

    # If no threads, get default
    if not threads:
        threads = self.get_threads()

    # Rename fields
    if not fields_to_rename:
        fields_to_rename = param.get("export", {}).get("fields_to_rename", None)
    self.rename_info_fields(fields_to_rename=fields_to_rename)

    # Output format — hoisted out of the header branch (fix): it is also
    # read below for the sample list, and previously was unbound when
    # export_header=False and no output_header was given
    output_file_type = get_file_format(output_file)

    # Auto header name with extension
    if export_header or output_header:
        if not output_header:
            output_header = f"{output_file}.hdr"
        # Export header
        self.export_header(output_file=output_file)
        # Switch off export header if VCF output
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

    # Chunk size
    if not chunk_size:
        chunk_size = config.get("chunk_size", None)

    # Parquet partitions (accept a comma-separated string)
    if not parquet_partitions:
        parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
    if parquet_partitions and isinstance(parquet_partitions, str):
        parquet_partitions = parquet_partitions.split(",")

    # Order by
    if not order_by:
        order_by = param.get("export", {}).get("order_by", "")

    # Header in output
    header_in_output = param.get("export", {}).get("include_header", False)

    # Database source
    database_source = self.get_connexion()

    # Connexion format
    connexion_format = self.get_connexion_format()

    # Explode infos
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=False,
        )

    # SQLite cannot be exported directly: dump the variants table to a
    # temporary Parquet file and export from there
    if connexion_format in ["sqlite"]:

        random_tmp = "".join(
            random.choice(string.ascii_lowercase) for _ in range(10)
        )
        database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
        tmp_to_remove.append(database_source)

        # Table Variants
        table_variants = self.get_table_variants()

        # Create export query
        sql_query_export_subquery = f"""
            SELECT * FROM {table_variants}
            """

        # Write source file
        fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

    # Create database object for the export
    database = Database(
        database=database_source,
        table="variants",
        header_file=output_header,
        conn_config=self.get_connexion_config(),
    )

    # Existing columns header
    existing_columns_header = database.get_header_columns_from_database(query=query)

    # Sample list (only meaningful for VCF outputs)
    if output_file_type in ["vcf"]:
        get_samples = self.get_samples()
        get_samples_check = self.get_samples_check()
        samples_force = get_samples is not None
        sample_list = self.get_header_sample_list(
            check=get_samples_check,
            samples=get_samples,
            samples_force=samples_force,
        )
    else:
        sample_list = None

    # Export file
    database.export(
        output_database=output_file,
        output_header=output_header,
        existing_columns_header=existing_columns_header,
        parquet_partitions=parquet_partitions,
        chunk_size=chunk_size,
        threads=threads,
        sort=sort,
        index=index,
        header_in_output=header_in_output,
        order_by=order_by,
        query=query,
        export_header=export_header,
        sample_list=sample_list,
    )

    # Remove temporary files
    remove_if_exists(tmp_to_remove)

    # Fixed: the original duplicated this existence check on both sides
    # of an `and`; a single check is equivalent
    return os.path.exists(output_file) or None
The export_output function exports data from a VCF file to various formats, including VCF,
CSV, TSV, PSV, and Parquet, with options for customization such as filtering, sorting, and
partitioning.
Parameters
- output_file: The
output_fileparameter is a string that specifies the name of the output file where the exported data will be saved - output_header: The
output_headerparameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as theoutput_fileparameter, but with the extension " - export_header: The
export_headerparameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. Ifexport_headeris True, the header will be exported to a file. Ifexport_headeris False, the header will not be, defaults to True - query: The
queryparameter in theexport_outputfunction is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported. This allows you to customize the exported data based on - parquet_partitions: The
parquet_partitionsparameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets - chunk_size: The
chunk_sizeparameter specifies the number of records in a batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files. It helps in optimizing the export process by breaking down the data into manageable chunks for processing and storage - threads: The
threadsparameter in theexport_outputfunction specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If this parameter is not provided, the function will use the default number of threads - sort: The
sortparameter in theexport_outputfunction is a boolean flag that determines whether the output file should be sorted based on genomic coordinates of the variants. Ifsortis set toTrue, the output file will be sorted. Ifsortis set toFalse,, defaults to False - index: The
indexparameter in theexport_outputfunction is a boolean flag that determines whether an index should be created on the output file. Ifindexis set toTrue, an index will be created on the output file. Ifindexis set toFalse, no, defaults to False - order_by: The
order_byparameter in theexport_outputfunction is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format. It allows you to specify the column(s) based on which the output file should be - fields_to_rename: The
fields_to_renameparameter is a dictionary that specifies the mapping of field names to be renamed during the export process. This parameter allows you to customize the output field names before exporting the data. Each key-value pair in the dictionary represents the original field name as the key and the new field name
Returns
The
export_outputfunction returns a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
2305 def get_extra_infos(self, table: str = None) -> list: 2306 """ 2307 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2308 in the header. 2309 2310 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2311 name of the table from which you want to retrieve the extra columns that are not present in the 2312 header. If the `table` parameter is not provided when calling the function, it will default to 2313 using the variants 2314 :type table: str 2315 :return: A list of columns that are in the specified table but not in the header of the table. 2316 """ 2317 2318 header_columns = [] 2319 2320 if not table: 2321 table = self.get_table_variants(clause="from") 2322 header_columns = self.get_header_columns() 2323 2324 # Check all columns in the database 2325 query = f""" SELECT * FROM {table} LIMIT 1 """ 2326 log.debug(f"query {query}") 2327 table_columns = self.get_query_to_df(query).columns.tolist() 2328 extra_columns = [] 2329 2330 # Construct extra infos (not in header) 2331 for column in table_columns: 2332 if column not in header_columns: 2333 extra_columns.append(column) 2334 2335 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- table: The
tableparameter in theget_extra_infosfunction is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If thetableparameter is not provided when calling the function, it will default to using the variants
Returns
A list of columns that are in the specified table but not in the header of the table.
2337 def get_extra_infos_sql(self, table: str = None) -> str: 2338 """ 2339 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2340 by double quotes 2341 2342 :param table: The name of the table to get the extra infos from. If None, the default table is 2343 used 2344 :type table: str 2345 :return: A string of the extra infos 2346 """ 2347 2348 return ", ".join( 2349 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2350 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
def export_header(
    self,
    header_name: str = None,
    output_file: str = None,
    output_file_ext: str = ".hdr",
    clean_header: bool = True,
    remove_chrom_line: bool = False,
) -> str:
    """
    Extract the VCF header, adjust it, and write it to a header file.

    The #CHROM line is replaced by the real columns found in the input
    database, optionally removed, and malformed FORMAT lines of type Flag
    are rewritten as type String.

    :param header_name: When neither `header_name` nor `output_file` is
        given, `output_file` falls back to self.get_output()
    :type header_name: str
    :param output_file: Base name of the file the header is written next to
        (the header file is `output_file + output_file_ext`)
    :type output_file: str
    :param output_file_ext: Extension appended to `output_file` for the
        header file, defaults to ".hdr"
    :type output_file_ext: str (optional)
    :param clean_header: Whether to rewrite malformed "Type=Flag" FORMAT
        lines as "Type=String", defaults to True
    :type clean_header: bool (optional)
    :param remove_chrom_line: Whether to drop the #CHROM line from the
        written header, defaults to False
    :type remove_chrom_line: bool (optional)
    :return: The name of the header file that is created (None when the
        object has no header).
    """

    if not header_name and not output_file:
        output_file = self.get_output()

    if self.get_header():

        # Get header object
        header_obj = self.get_header()

        # Create database on the input to discover its real columns
        db_for_header = Database(database=self.get_input())
        db_header_columns = db_for_header.get_columns()

        with tempfile.TemporaryDirectory() as tmpdir:

            # Write the header to a temporary file.
            # Fixed: use context managers instead of manual open/close,
            # which leaked the handle on error
            header_file_tmp = os.path.join(tmpdir, "header")
            with open(header_file_tmp, "w") as f:
                vcf.Writer(f, header_obj)

            # Replace #CHROM line with the real columns
            header_list = db_for_header.read_header_file(
                header_file=header_file_tmp
            )
            header_list[-1] = "\t".join(db_header_columns)

            # Remove CHROM line
            if remove_chrom_line:
                header_list.pop()

            # Clean header for malformed FORMAT lines
            if clean_header:
                header_list_clean = []
                for head in header_list:
                    head_clean = re.subn(
                        "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
                        r"##FORMAT=<ID=\1,Number=\2,Type=String",
                        head,
                        2,
                    )[0]
                    header_list_clean.append(head_clean)
                header_list = header_list_clean

            tmp_header_name = output_file + output_file_ext

            with open(tmp_header_name, "w") as f:
                f.writelines(header_list)

            return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- header_name: The
header_nameparameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file - output_file: The
output_fileparameter in theexport_headerfunction is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file - output_file_ext: The
output_file_extparameter in theexport_headerfunction is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to theoutput_filename to create the final, defaults to .hdr - clean_header: The
clean_headerparameter in theexport_headerfunction is a boolean flag that determines whether the header should be cleaned or not. Whenclean_headeris set toTrue, the function will clean the header by modifying certain lines based on a specific pattern. Ifclean_header, defaults to True - remove_chrom_line: The
remove_chrom_lineparameter in theexport_headerfunction is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set toTrue, the #CHROM line will be removed; if set to `, defaults to False
Returns
The function
export_headerreturns the name of the temporary header file that is created.
def export_variant_vcf(
    self,
    vcf_file,
    remove_info: bool = False,
    add_samples: bool = True,
    list_samples: list = None,
    where_clause: str = "",
    index: bool = False,
    threads: int | None = None,
) -> bool | None:
    """
    Export the variants table as a VCF file.

    Builds a SELECT over the variants table with the standard VCF columns,
    an optional INFO column (kept, or replaced by a placeholder), optional
    FORMAT/sample columns, and delegates the actual writing (header,
    compression, sorting, indexing) to `export_output`.

    :param vcf_file: path of the output VCF file to write
    :param remove_info: if truthy, replace the INFO column by a constant
        placeholder; a string value is used verbatim as the placeholder,
        any other truthy value becomes "." (VCF empty-INFO convention),
        defaults to False
    :type remove_info: bool (optional)
    :param add_samples: if True, include FORMAT and sample columns in the
        output; if False, genotype columns are dropped, defaults to True
    :type add_samples: bool (optional)
    :param list_samples: samples to include in the output; when None or
        empty, all samples from the header are used, defaults to None
    :type list_samples: list
    :param where_clause: optional SQL WHERE clause (including the "WHERE"
        keyword) used to filter the exported variants; None is treated as
        no filter, defaults to ""
    :type where_clause: str
    :param index: if True, index the output VCF file (tabix), defaults to
        False
    :type index: bool (optional)
    :param threads: number of threads for the export; when None, the
        instance-configured thread count is used
    :type threads: int | None
    :return: the result of `export_output` for the generated query
    """

    # Config (source of the export chunk size)
    config = self.get_config()

    # Extract VCF
    log.debug("Export VCF...")

    # Table variants
    table_variants = self.get_table_variants()

    # Threads
    if not threads:
        threads = self.get_threads()

    # INFO column: kept as-is, or replaced by a constant placeholder
    if remove_info:
        if not isinstance(remove_info, str):
            remove_info = "."
        info_field = f"""'{remove_info}' as INFO"""
    else:
        info_field = "INFO"

    # Samples fields (FORMAT plus one quoted column per sample).
    # NOTE: default is None (not a shared mutable []) — falsy values all
    # mean "use the full header sample list".
    if add_samples:
        if not list_samples:
            list_samples = self.get_header_sample_list()
        if list_samples:
            samples_fields = " , FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in list_samples]
            )
        else:
            samples_fields = ""
        log.debug(f"samples_fields: {samples_fields}")
    else:
        samples_fields = ""

    # Where clause (normalize None to an empty filter)
    if where_clause is None:
        where_clause = ""

    # Variants
    select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
    sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
    log.debug(f"sql_query_select={sql_query_select}")

    return self.export_output(
        output_file=vcf_file,
        output_header=None,
        export_header=True,
        query=sql_query_select,
        parquet_partitions=None,
        chunk_size=config.get("chunk_size", None),
        threads=threads,
        sort=True,
        index=index,
        order_by=None,
    )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- vcf_file: The
vcf_fileparameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters - remove_info: The
`remove_info` parameter in the `export_variant_vcf` function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included, defaults to False - add_samples: The
add_samplesparameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True - list_samples: The
list_samplesparameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file - index: The
indexparameter in theexport_variant_vcffunction is a boolean flag that determines whether or not to create an index for the output VCF file. Ifindexis set toTrue, the output VCF file will be indexed using tabix. Ifindex, defaults to False - threads: The
threadsparameter in theexport_variant_vcffunction specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If
Returns
The
export_variant_vcffunction returns the result of calling theexport_outputmethod with various parameters including the output file, query, threads, sort flag, and index flag. Theexport_outputmethod is responsible for exporting the VCF data based on the specified parameters and configurations provided in theexport_variant_vcffunction.
def run_commands(self, commands: list = None, threads: int = 1) -> None:
    """
    Run a list of shell commands in parallel.

    :param commands: A list of commands to run; None means no commands
    :param threads: The number of threads to use, defaults to 1 (optional)
    """

    # Avoid a shared mutable default argument; treat None as "no commands"
    if commands is None:
        commands = []

    run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
    """
    Return the number of threads to use for the current job.

    The value is read from the parameters first, then from the
    configuration. When no value is configured, `default` is used; a
    configured value <= 0 (including 0) means "use all available cores".

    :param default: the number of threads to use when nothing is
        configured, defaults to 1
    :type default: int (optional)
    :return: the number of threads to use for the current job.
    """

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Input threads (param takes precedence over config)
    input_thread = param.get("threads", config.get("threads", None))

    # NOTE: an explicit 0 must reach the "<= 0 -> all cores" branch, so
    # only missing/empty values fall back to the default (a plain
    # truthiness test would wrongly send 0 to `default`)
    if input_thread is None or input_thread == "":
        threads = default
    elif int(input_thread) <= 0:
        threads = os.cpu_count()
    else:
        threads = int(input_thread)
    return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- default: The
`default` parameter in the `get_threads` method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the `threads` parameter in the configuration or input parameters, the `default` value will be used, defaults to 1
Returns
the number of threads to use for the current job.
def get_memory(self, default: str = None) -> str:
    """
    Return the memory setting for the current job.

    The value is looked up in the parameters first, then in the
    configuration; when neither defines "memory", ``default`` is returned.

    :param default: fallback value when no memory setting is configured
    :type default: str
    :return: the configured memory value, or ``default`` when absent
    """

    # Parameters take precedence over configuration
    configured = self.get_param().get(
        "memory", self.get_config().get("memory", None)
    )

    return configured if configured else default
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- default: The
get_memoryfunction takes in a default value as a string parameter. This default value is used as a fallback in case thememoryparameter is not provided in theparamdictionary or theconfigdictionary. Ifmemoryis not found in either dictionary, the function
Returns
The
`get_memory` function returns a string value representing the memory parameter. If the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
def update_from_vcf(self, vcf_file: str) -> None:
    """
    Dispatch the VCF-based update to the backend matching the connexion.

    DuckDB connexions use the DataFrame-based update, SQLite connexions
    use the temporary-table update; any other format is a no-op.

    :param vcf_file: the path to the VCF file
    """

    # Map each supported connexion format to its update implementation
    dispatch = {
        "duckdb": self.update_from_vcf_duckdb,
        "sqlite": self.update_from_vcf_sqlite,
    }

    handler = dispatch.get(self.get_connexion_format())
    if handler is not None:
        handler(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    """
    Update the INFO column of the variants table with the INFO column of a
    VCF file, using DuckDB.

    Each variant (matched on #CHROM/POS/REF/ALT) gets the VCF INFO
    appended to its existing INFO, separated by ';' when both sides are
    non-empty ('' and '.' count as empty).

    :param vcf_file: the path to the VCF file
    """

    # variants table
    table_variants = self.get_table_variants()

    # Load the VCF body into a DataFrame, skipping the '##' meta-header
    # lines so that the '#CHROM' line becomes the column header
    skip = self.get_header_length(file=vcf_file)
    vcf_df = pd.read_csv(
        vcf_file,
        sep="\t",
        engine="c",
        skiprows=skip,
        header=0,
        low_memory=False,
    )
    # NOTE: vcf_df looks unused, but it is referenced by name inside the
    # SQL below — DuckDB resolves 'vcf_df' as a replacement scan over the
    # in-scope pandas DataFrame. Do not rename or delete it.
    sql_query_update = f"""
    UPDATE {table_variants} as table_variants
        SET INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    (
                    SELECT
                        concat(
                            CASE
                                WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                THEN ';'
                                ELSE ''
                            END
                            ,
                            CASE
                                WHEN table_parquet.INFO NOT IN ('','.')
                                THEN table_parquet.INFO
                                ELSE ''
                            END
                        )
                    FROM vcf_df as table_parquet
                    WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                        AND table_parquet.INFO NOT IN ('','.')
                    )
                )
        ;
    """
    self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    """
    Update the INFO column of the 'variants' table with the INFO column of
    a VCF file, using a temporary SQLite table.

    The VCF body is loaded into a temporary table, then each variant
    (matched on #CHROM/POS/REF/ALT) gets the VCF INFO appended to its
    existing INFO, separated by ';' when both sides are non-empty
    ('' and '.' count as empty). The temporary table is dropped afterwards.

    :param vcf_file: The path to the VCF file you want to update the database with
    """

    # Create a temporary table with the same columns as 'variants'
    # (WHERE 0 copies the schema without copying any rows)
    table_vcf = "tmp_vcf"
    sql_create = (
        f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
    )
    self.conn.execute(sql_create)

    # Load the VCF body into the temporary table, skipping all '#' header
    # lines; the 8 fixed VCF columns are assigned positionally
    vcf_df = pd.read_csv(
        vcf_file, sep="\t", comment="#", header=None, low_memory=False
    )
    vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
    vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

    # Update table 'variants' with VCF data
    # warning: SQLite has no CONCAT function; the || operator is used
    # NOTE(review): when no matching row exists in tmp_vcf, the scalar
    # subquery yields NULL and `INFO || NULL` is NULL, which would null
    # out INFO for unmatched variants — confirm callers always update
    # from a VCF covering all variants
    sql_query_update = f"""
    UPDATE variants as table_variants
    SET INFO = CASE
            WHEN INFO NOT IN ('', '.')
            THEN INFO
            ELSE ''
        END ||
        (
        SELECT
            CASE
                WHEN table_variants.INFO NOT IN ('','.')
                    AND table_vcf.INFO NOT IN ('','.')
                THEN ';'
                ELSE ''
            END ||
            CASE
                WHEN table_vcf.INFO NOT IN ('','.')
                THEN table_vcf.INFO
                ELSE ''
            END
        FROM {table_vcf} as table_vcf
        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
            AND table_vcf.\"POS\" = table_variants.\"POS\"
            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
            AND table_vcf.\"REF\" = table_variants.\"REF\"
        )
    """
    self.conn.execute(sql_query_update)

    # Drop the temporary table
    sql_drop = f"DROP TABLE {table_vcf}"
    self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
    """
    Drop the variants table from the database, if it exists.
    """

    self.conn.execute(f"DROP TABLE IF EXISTS {self.get_table_variants()}")
This function drops the variants table
def set_variant_id(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    Add a variant-id column to the variants table and populate it with a
    hash of the assembly, `#CHROM`, `POS`, `REF` and `ALT` columns.

    :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
    :type variant_id_column: str (optional)
    :param force: If True, the column is (re)computed even if it already
        exists
    :type force: bool
    :return: The name of the column that contains the variant_id
    """

    # Assembly (param takes precedence over config)
    assembly = self.get_param().get(
        "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
    )

    # INFO/Tag prefix
    prefix = self.get_explode_infos_prefix()

    # Explode INFO/SVTYPE into a column (removed again at the end)
    added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

    # variants table
    table_variants = self.get_table_variants()

    # variant_id column (falsy name falls back to the default)
    if not variant_id_column:
        variant_id_column = "variant_id"

    # Create the variant-id column when missing or when forced.
    # Fix: check the requested column name, not the hard-coded literal
    # "variant_id", so custom column names are created too.
    if variant_id_column not in self.get_extra_infos() or force:

        # Create column
        self.add_column(
            table_name=table_variants,
            column_name=variant_id_column,
            column_type="UBIGINT",
            default_value="0",
        )

        # Update column.
        # NOTE(review): the last hash() argument is the *literal string*
        # '"{prefix}SVTYPE"', not the exploded SVTYPE column — which makes
        # the explode_infos() call above pointless. Kept as-is because the
        # column may not exist when SVTYPE is absent from the header;
        # confirm intended behavior before switching to a column reference.
        self.conn.execute(
            f"""
            UPDATE {table_variants}
            SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
            """
        )

    # Remove the temporary exploded columns
    for added_column in added_columns:
        self.drop_column(column=added_column)

    # return variant_id column name
    return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
def get_variant_id_column(
    self, variant_id_column: str = "variant_id", force: bool = None
) -> str:
    """
    Return the name of the variant-id column.

    Delegates to `set_variant_id`, which creates and populates the column
    when needed (always recomputing it when ``force`` is truthy) before
    returning its name.

    :param variant_id_column: the name of the column that contains the
        variant IDs, defaults to variant_id
    :type variant_id_column: str (optional)
    :param force: when truthy, recompute the column even if it already
        exists
    :type force: bool
    :return: The variant_id column name.
    """

    return self.set_variant_id(variant_id_column, force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
def scan_databases(
    self,
    database_formats: list = None,
    database_releases: list = None,
) -> dict:
    """
    Scan for available annotation databases matching the given formats and
    releases.

    :param database_formats: the formats of the databases to be scanned;
        None means ["parquet"]
    :type database_formats: list
    :param database_releases: the releases of the databases to be scanned;
        None means ["current"]
    :type database_releases: list
    :return: a dictionary describing the databases that match the
        specified formats and releases
    """

    # Avoid shared mutable default arguments
    if database_formats is None:
        database_formats = ["parquet"]
    if database_releases is None:
        database_releases = ["current"]

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param - Assembly (fall back to the default assembly with a warning)
    assembly = param.get("assembly", config.get("assembly", None))
    if not assembly:
        assembly = DEFAULT_ASSEMBLY
        log.warning(f"Default assembly '{assembly}'")

    # Scan for availabled databases
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
    )
    databases_infos_dict = databases_infos(
        database_folder_releases=database_releases,
        database_formats=database_formats,
        assembly=assembly,
        config=config,
    )
    log.info(
        f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
    )

    return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- database_formats: The
`database_formats` parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet" - database_releases: The
database_releasesparameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value fordatabase_releasesis set to["current"], meaning that by default, the function will scan databases that are in the "current"
Returns
The function
scan_databasesreturns a dictionary containing information about databases that match the specified formats and releases.
def annotation(self) -> None:
    """
    Annotate the VCF with the annotations specified in the parameters.

    The method normalizes all "quick annotation" inputs (the `annotations`
    string/dict plus the per-tool `annotation_*` parameters) into the
    structured `param["annotation"]` dict, resolving database files on
    disk and auto-detecting the annotation tool for each one, then runs
    every configured annotation backend (parquet, bcftools, snpsift,
    bigwig, annovar, snpeff, exomiser, splice). Finally, INFO fields are
    exploded into table columns when configured.
    """

    # Config
    config = self.get_config()

    # Param
    param = self.get_param()

    # Param - Assembly (fall back to the default assembly with a warning)
    assembly = param.get("assembly", config.get("assembly", None))
    if not assembly:
        assembly = DEFAULT_ASSEMBLY
        log.warning(f"Default assembly '{assembly}'")

    # Folders searched when resolving annotation database files by name
    annotations_databases = set(
        config.get("folders", {})
        .get("databases", {})
        .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
        + config.get("folders", {})
        .get("databases", {})
        .get("parquet", ["~/howard/databases/parquet/current"])
        + config.get("folders", {})
        .get("databases", {})
        .get("bcftools", ["~/howard/databases/bcftools/current"])
    )

    # Quick annotations given as a comma-separated string
    if param.get("annotations", None) and isinstance(
        param.get("annotations", None), str
    ):
        log.debug(param.get("annotations", None))
        param_annotation_list = param.get("annotations").split(",")
    else:
        param_annotation_list = []

    # Per-tool quick parameters, folded into the same list with a
    # "tool:" prefix (lists are joined; ',' becomes '+' where the value
    # is a single tool spec)
    if param.get("annotation_parquet", None) != None:
        log.debug(
            f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
        )
        if isinstance(param.get("annotation_parquet", None), list):
            param_annotation_list.append(",".join(param.get("annotation_parquet")))
        else:
            param_annotation_list.append(param.get("annotation_parquet"))
    if param.get("annotation_snpsift", None) != None:
        if isinstance(param.get("annotation_snpsift", None), list):
            param_annotation_list.append(
                "snpsift:"
                + "+".join(param.get("annotation_snpsift")).replace(",", "+")
            )
        else:
            param_annotation_list.append(
                "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
            )
    # NOTE(review): annotation_snpeff has no list handling, unlike the
    # other tools — confirm whether a list is ever passed here
    if param.get("annotation_snpeff", None) != None:
        param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
    if param.get("annotation_bcftools", None) != None:
        if isinstance(param.get("annotation_bcftools", None), list):
            param_annotation_list.append(
                "bcftools:"
                + "+".join(param.get("annotation_bcftools")).replace(",", "+")
            )
        else:
            param_annotation_list.append(
                "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
            )
    if param.get("annotation_annovar", None) != None:
        param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
    if param.get("annotation_exomiser", None) != None:
        param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
    if param.get("annotation_splice", None) != None:
        param_annotation_list.append("splice:" + param.get("annotation_splice"))

    # Merge everything back into a single comma-separated string
    param["annotations"] = ",".join(param_annotation_list)

    # debug
    log.debug(f"param_annotations={param['annotations']}")

    if param.get("annotations"):

        if not "annotation" in param:
            param["annotation"] = {}

        # Normalize annotations into a {name: fields} dict; a string
        # input maps every entry to {"INFO": None} (all fields)
        annotations_list_input = {}
        if isinstance(param.get("annotations", None), str):
            annotation_file_list = [
                value for value in param.get("annotations", "").split(",")
            ]
            for annotation_file in annotation_file_list:
                annotations_list_input[annotation_file.strip()] = {"INFO": None}
        else:
            annotations_list_input = param.get("annotations", {})

        log.info(f"Quick Annotations:")
        for annotation_key in list(annotations_list_input.keys()):
            log.info(f" {annotation_key}")

        # List of annotations and associated fields
        annotations_list = {}

        for annotation_file in annotations_list_input:

            # "ALL[:format=...][:release=...]" expands to every database
            # found by scan_databases for the requested formats/releases
            if (
                annotation_file.upper() == "ALL"
                or annotation_file.upper().startswith("ALL:")
            ):

                # check ALL parameters (formats, releases)
                annotation_file_split = annotation_file.split(":")
                database_formats = "parquet"
                database_releases = "current"
                for annotation_file_option in annotation_file_split[1:]:
                    database_all_options_split = annotation_file_option.split("=")
                    if database_all_options_split[0] == "format":
                        database_formats = database_all_options_split[1].split("+")
                    if database_all_options_split[0] == "release":
                        database_releases = database_all_options_split[1].split("+")

                # Scan for availabled databases
                databases_infos_dict = self.scan_databases(
                    database_formats=database_formats,
                    database_releases=database_releases,
                )

                # Add found databases in annotation parameters
                for database_infos in databases_infos_dict.keys():
                    annotations_list[database_infos] = {"INFO": None}

            else:
                annotations_list[annotation_file] = annotations_list_input[
                    annotation_file
                ]

        # Check each databases
        if len(annotations_list):

            log.info(
                f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
            )

            for annotation_file in annotations_list:

                # Fields requested for this annotation entry
                annotations = annotations_list.get(annotation_file, None)

                # Annotation snpEff: everything after "snpeff:" is passed
                # through as snpEff command-line options
                if annotation_file.startswith("snpeff"):

                    log.debug(f"Quick Annotation snpEff")

                    if "snpeff" not in param["annotation"]:
                        param["annotation"]["snpeff"] = {}

                    if "options" not in param["annotation"]["snpeff"]:
                        param["annotation"]["snpeff"]["options"] = ""

                    # snpEff options in annotations
                    param["annotation"]["snpeff"]["options"] = "".join(
                        annotation_file.split(":")[1:]
                    )

                # Annotation Annovar: each ":"-separated token is an
                # annovar annotation name
                elif annotation_file.startswith("annovar"):

                    log.debug(f"Quick Annotation Annovar")

                    if "annovar" not in param["annotation"]:
                        param["annotation"]["annovar"] = {}

                    if "annotations" not in param["annotation"]["annovar"]:
                        param["annotation"]["annovar"]["annotations"] = {}

                    # Options
                    annotation_file_split = annotation_file.split(":")
                    for annotation_file_annotation in annotation_file_split[1:]:
                        if annotation_file_annotation:
                            param["annotation"]["annovar"]["annotations"][
                                annotation_file_annotation
                            ] = annotations

                # Annotation Exomiser: the whole spec string is parsed
                # into a parameter dict
                elif annotation_file.startswith("exomiser"):

                    log.debug(f"Quick Annotation Exomiser")

                    param["annotation"]["exomiser"] = params_string_to_dict(
                        annotation_file
                    )

                # Annotation Splice: same spec-string parsing as Exomiser
                elif annotation_file.startswith("splice"):

                    log.debug(f"Quick Annotation Splice")

                    param["annotation"]["splice"] = params_string_to_dict(
                        annotation_file
                    )

                # Everything else: file-based annotation (parquet,
                # bcftools, snpsift or bigwig)
                else:

                    # Explicit tool prefix, if any
                    if annotation_file.startswith("bcftools:"):
                        annotation_tool_initial = "bcftools"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    elif annotation_file.startswith("snpsift:"):
                        annotation_tool_initial = "snpsift"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    elif annotation_file.startswith("bigwig:"):
                        annotation_tool_initial = "bigwig"
                        annotation_file = ":".join(annotation_file.split(":")[1:])
                    else:
                        annotation_tool_initial = None

                    # Multiple files may be given, separated by '+' or ':'
                    annotation_file_list = annotation_file.replace("+", ":").split(
                        ":"
                    )

                    for annotation_file in annotation_file_list:

                        if annotation_file:

                            # Annotation tool initial
                            annotation_tool = annotation_tool_initial

                            # Resolve the file: as given, as a full path,
                            # then by searching the databases folders
                            annotation_file_found = None

                            if os.path.exists(annotation_file):
                                annotation_file_found = annotation_file
                            elif os.path.exists(full_path(annotation_file)):
                                annotation_file_found = full_path(annotation_file)
                            else:
                                # Find within assembly folders
                                for annotations_database in annotations_databases:
                                    found_files = find_all(
                                        annotation_file,
                                        os.path.join(
                                            annotations_database, assembly
                                        ),
                                    )
                                    if len(found_files) > 0:
                                        annotation_file_found = found_files[0]
                                        break
                                if not annotation_file_found and not assembly:
                                    # Find within folders
                                    for (
                                        annotations_database
                                    ) in annotations_databases:
                                        found_files = find_all(
                                            annotation_file, annotations_database
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                            # Full path
                            annotation_file_found = full_path(annotation_file_found)

                            if annotation_file_found:

                                database = Database(database=annotation_file_found)
                                quick_annotation_format = database.get_format()
                                quick_annotation_is_compressed = (
                                    database.is_compressed()
                                )
                                quick_annotation_is_indexed = os.path.exists(
                                    f"{annotation_file_found}.tbi"
                                )
                                # bcftools is never preferred here; kept as
                                # a switch for the auto-detection below
                                bcftools_preference = False

                                # Auto-detect the tool from the database
                                # format when no explicit prefix was given
                                if not annotation_tool:
                                    if (
                                        bcftools_preference
                                        and quick_annotation_format
                                        in ["vcf", "bed"]
                                        and quick_annotation_is_compressed
                                        and quick_annotation_is_indexed
                                    ):
                                        annotation_tool = "bcftools"
                                    # NOTE(review): "tsv" is listed twice
                                    elif quick_annotation_format in [
                                        "vcf",
                                        "bed",
                                        "tsv",
                                        "tsv",
                                        "csv",
                                        "json",
                                        "tbl",
                                        "parquet",
                                        "duckdb",
                                    ]:
                                        annotation_tool = "parquet"
                                    elif quick_annotation_format in ["bw"]:
                                        annotation_tool = "bigwig"
                                    else:
                                        log.error(
                                            f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                        )
                                        raise ValueError(
                                            f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                        )

                                log.debug(
                                    f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                )

                                # Register the file under its tool in the
                                # structured annotation parameters
                                if annotation_tool:
                                    if annotation_tool not in param["annotation"]:
                                        param["annotation"][annotation_tool] = {}
                                    if (
                                        "annotations"
                                        not in param["annotation"][annotation_tool]
                                    ):
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ] = {}
                                    param["annotation"][annotation_tool][
                                        "annotations"
                                    ][annotation_file_found] = annotations

                            else:
                                log.warning(
                                    f"Quick Annotation File {annotation_file} does NOT exist"
                                )

    self.set_param(param)

    # Run every configured annotation backend
    if param.get("annotation", None):
        log.info("Annotations")
        if param.get("annotation", {}).get("parquet", None):
            log.info("Annotations 'parquet'...")
            self.annotation_parquet()
        if param.get("annotation", {}).get("bcftools", None):
            log.info("Annotations 'bcftools'...")
            self.annotation_bcftools()
        if param.get("annotation", {}).get("snpsift", None):
            log.info("Annotations 'snpsift'...")
            self.annotation_snpsift()
        if param.get("annotation", {}).get("bigwig", None):
            log.info("Annotations 'bigwig'...")
            self.annotation_bigwig()
        if param.get("annotation", {}).get("annovar", None):
            log.info("Annotations 'annovar'...")
            self.annotation_annovar()
        if param.get("annotation", {}).get("snpeff", None):
            log.info("Annotations 'snpeff'...")
            self.annotation_snpeff()
        if param.get("annotation", {}).get("exomiser", None) is not None:
            log.info("Annotations 'exomiser'...")
            self.annotation_exomiser()
        if param.get("annotation", {}).get("splice", None) is not None:
            log.info("Annotations 'splice' ...")
            self.annotation_splice()

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )
It annotates the VCF file with the annotations specified in the config file.
    def annotation_bigwig(self, threads: int = None) -> None:
        """
        Annotate variants in the variants table using BigWig databases.

        For each database configured in param section "annotation" -> "bigwig" ->
        "annotations", the variants are exported to a temporary VCF, each variant
        position is looked up in the BigWig file(s) with pyBigWig, matching values
        are written into INFO fields (via cyvcf2), and the annotated VCF is merged
        back into the variants table with `update_from_vcf`.

        :param threads: Number of threads for parallel processing (currently not
            used by this method; see the commented-out block below)
        :type threads: int
        :return: True on completion (also when no annotation is configured);
            returns None early when the variants table is empty
        :raises ValueError: if a database is missing/invalid, if a requested
            annotation field is absent from the database header, or if a file is
            not in BigWig format
        """

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads (currently disabled: annotation below is single-threaded)
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - databases folders: generic "annotations" folders plus
        # bigwig-specific folders, de-duplicated
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping of database path/name -> requested annotation fields
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file (path only here; written later if needed)
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # Per-database annotation configuration accumulated here
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database (resolves the file within databases_folders)
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is http ?
                    if database.get_database().startswith("http"):

                        # Database is an HTTP URL (pyBigWig can open remote files)
                        db_file_is_http = True

                        # DB file kept as URL
                        db_file = database.get_database()
                        log.warning(
                            f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)"
                        )

                        # Retrieve automatic annotation field name from the file name
                        annotation_field = clean_annotation_field(
                            os.path.basename(db_file).replace(".bw", "")
                        )
                        log.debug(
                            f"Create header file with annotation field '{annotation_field}' is an HTTP URL"
                        )

                        # Create automatic header file (no local header available
                        # for a remote database)
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, "w") as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(
                                f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n"""
                            )
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT an HTTP URL
                        db_file_is_http = False

                    # Validate database: file (unless remote), header file and
                    # BigWig format are all required
                    if (
                        db_file is None
                        or db_hdr_file is None
                        or (not os.path.exists(db_file) and not db_file_is_http)
                        or not os.path.exists(db_hdr_file)
                        or not db_file_type in ["bw"]
                    ):
                        # if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database ("ALL"/"INFO" selects every
                        # INFO field declared in the database header)
                        # NOTE(review): annotation_fields_full is set but not used
                        # later in this method — confirm whether it can be removed
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init per-database mappings:
                        #   rename dict: new field name -> original field id
                        #   header list: cyvcf2 INFO header dicts to add
                        #   indexes: new field name -> column index in db header
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name
                            annotation_field_new = annotation_fields[annotation_field]

                            # Check annotation field and index in header
                            # (-3 skips the #CHROM/START/END leading columns)
                            if (
                                annotation_field
                                in db_hdr_vcf.get_header_columns_as_list()
                            ):
                                annotation_field_index = (
                                    db_hdr_vcf.get_header_columns_as_list().index(
                                        annotation_field
                                    )
                                    - 3
                                )
                                cyvcf2_header_indexes[annotation_field_new] = (
                                    annotation_field_index
                                )
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = (
                                db_hdr_vcf_header_infos[annotation_field].id
                            )
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].num,
                                    "Type": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].type,
                                    "Description": db_hdr_vcf_header_infos[
                                        annotation_field
                                    ].desc,
                                }
                            )

                            # Add header on VCF (so the field appears in the
                            # object's own header too)
                            vcf_reader.infos[annotation_field_new] = vcf.parser._Info(
                                annotation_field_new,
                                db_hdr_vcf_header_infos[annotation_field].num,
                                db_hdr_vcf_header_infos[annotation_field].type,
                                db_hdr_vcf_header_infos[annotation_field].desc,
                                "HOWARD BigWig annotation",
                                "unknown",
                                self.code_type_map[
                                    db_hdr_vcf_header_infos[annotation_field].type
                                ],
                            )

                        # Load bigwig database
                        # NOTE(review): bw_db is never closed (no bw_db.close());
                        # handles stay open until process exit — consider closing
                        # after the annotation loop. TODO confirm
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes,
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(
                        f"annotation_bigwig_config={annotation_bigwig_config_list}"
                    )

                    # Export VCF file (INFO stripped; re-annotated below)
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get(
                            "cyvcf2_header_list", []
                        ):
                            log.info(
                                f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'"
                            )
                            input_vcf.add_info_to_header(cyvcf2_header_field)

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir, "output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Fetch variants
                    log.info(f"Annotations 'bigwig' start...")

                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get(
                                "cyvcf2_header_indexes", None
                            )

                            # Retrieve value from chrom pos (VCF POS is 1-based,
                            # pyBigWig intervals are 0-based half-open)
                            # NOTE(review): res holds per-base values for this
                            # single position, yet it is indexed with the
                            # header-derived column index below — this assumes a
                            # single-value track layout; TODO confirm
                            res = bw_db.values(
                                variant.CHROM, variant.POS - 1, variant.POS
                            )

                            # For each annotation fields (and indexes)
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # Skip NaN (no value in the BigWig at this position)
                                if not np.isnan(
                                    res[cyvcf2_header_indexes[cyvcf2_header_index]]
                                ):
                                    variant.INFO[cyvcf2_header_index] = res[
                                        cyvcf2_header_indexes[cyvcf2_header_index]
                                    ]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        return True
The function annotation_bigwig annotates variants in a VCF file using bigwig databases.
Parameters
- threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the number of threads to be used for parallel processing during the annotation process. If the `threads` parameter is not provided, the method will attempt to determine the optimal number of threads to use based on the system configuration.
Returns
True
3621 def annotation_snpsift(self, threads: int = None) -> None: 3622 """ 3623 This function annotate with bcftools 3624 3625 :param threads: Number of threads to use 3626 :return: the value of the variable "return_value". 3627 """ 3628 3629 # DEBUG 3630 log.debug("Start annotation with bcftools databases") 3631 3632 # Threads 3633 if not threads: 3634 threads = self.get_threads() 3635 log.debug("Threads: " + str(threads)) 3636 3637 # Config 3638 config = self.get_config() 3639 log.debug("Config: " + str(config)) 3640 3641 # Config - snpSift 3642 snpsift_bin_command = get_bin_command( 3643 bin="SnpSift.jar", 3644 tool="snpsift", 3645 bin_type="jar", 3646 config=config, 3647 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3648 ) 3649 if not snpsift_bin_command: 3650 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3651 log.error(msg_err) 3652 raise ValueError(msg_err) 3653 3654 # Config - bcftools 3655 bcftools_bin_command = get_bin_command( 3656 bin="bcftools", 3657 tool="bcftools", 3658 bin_type="bin", 3659 config=config, 3660 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3661 ) 3662 if not bcftools_bin_command: 3663 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3664 log.error(msg_err) 3665 raise ValueError(msg_err) 3666 3667 # Config - BCFTools databases folders 3668 databases_folders = set( 3669 self.get_config() 3670 .get("folders", {}) 3671 .get("databases", {}) 3672 .get("annotations", ["."]) 3673 + self.get_config() 3674 .get("folders", {}) 3675 .get("databases", {}) 3676 .get("bcftools", ["."]) 3677 ) 3678 log.debug("Databases annotations: " + str(databases_folders)) 3679 3680 # Param 3681 annotations = ( 3682 self.get_param() 3683 .get("annotation", {}) 3684 .get("snpsift", {}) 3685 .get("annotations", None) 3686 ) 3687 log.debug("Annotations: " + str(annotations)) 3688 3689 # Assembly 3690 assembly = self.get_param().get( 3691 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3692 ) 3693 
3694 # Data 3695 table_variants = self.get_table_variants() 3696 3697 # Check if not empty 3698 log.debug("Check if not empty") 3699 sql_query_chromosomes = ( 3700 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3701 ) 3702 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3703 if not sql_query_chromosomes_df["count"][0]: 3704 log.info(f"VCF empty") 3705 return 3706 3707 # VCF header 3708 vcf_reader = self.get_header() 3709 log.debug("Initial header: " + str(vcf_reader.infos)) 3710 3711 # Existing annotations 3712 for vcf_annotation in self.get_header().infos: 3713 3714 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3715 log.debug( 3716 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3717 ) 3718 3719 if annotations: 3720 3721 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3722 3723 # Export VCF file 3724 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3725 3726 # Init 3727 commands = {} 3728 3729 for annotation in annotations: 3730 annotation_fields = annotations[annotation] 3731 3732 # Annotation Name 3733 annotation_name = os.path.basename(annotation) 3734 3735 if not annotation_fields: 3736 annotation_fields = {"INFO": None} 3737 3738 log.debug(f"Annotation '{annotation_name}'") 3739 log.debug( 3740 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3741 ) 3742 3743 # Create Database 3744 database = Database( 3745 database=annotation, 3746 databases_folders=databases_folders, 3747 assembly=assembly, 3748 ) 3749 3750 # Find files 3751 db_file = database.get_database() 3752 db_file = full_path(db_file) 3753 db_hdr_file = database.get_header_file() 3754 db_hdr_file = full_path(db_hdr_file) 3755 db_file_type = database.get_format() 3756 db_tbi_file = f"{db_file}.tbi" 3757 db_file_compressed = database.is_compressed() 3758 3759 # Check if compressed 3760 if not db_file_compressed: 3761 log.error( 3762 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3763 ) 3764 raise ValueError( 3765 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3766 ) 3767 3768 # Check if indexed 3769 if not os.path.exists(db_tbi_file): 3770 log.error( 3771 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3772 ) 3773 raise ValueError( 3774 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3775 ) 3776 3777 # Check index - try to create if not exists 3778 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3779 log.error("Annotation failed: database not valid") 3780 log.error(f"Annotation annotation file: {db_file}") 3781 log.error(f"Annotation annotation header: {db_hdr_file}") 3782 log.error(f"Annotation annotation index: {db_tbi_file}") 3783 raise ValueError( 3784 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3785 ) 3786 else: 3787 3788 log.debug( 3789 f"Annotation '{annotation}' - file: " 3790 + str(db_file) 3791 + " and " 3792 + str(db_hdr_file) 3793 ) 3794 3795 # Load header as VCF object 3796 db_hdr_vcf = Variants(input=db_hdr_file) 3797 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3798 log.debug( 3799 "Annotation database header: " 3800 + str(db_hdr_vcf_header_infos) 3801 ) 3802 3803 # For all fields in database 3804 annotation_fields_full = False 3805 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3806 annotation_fields = { 3807 key: key for key in db_hdr_vcf_header_infos 3808 } 3809 log.debug( 3810 "Annotation database header - All annotations added: " 3811 + str(annotation_fields) 3812 ) 3813 annotation_fields_full = True 3814 3815 # # Create file for field rename 3816 # log.debug("Create file for field rename") 3817 # tmp_rename = NamedTemporaryFile( 3818 # prefix=self.get_prefix(), 3819 # dir=self.get_tmp_dir(), 3820 # suffix=".rename", 3821 # delete=False, 3822 # ) 3823 # tmp_rename_name = tmp_rename.name 
3824 # tmp_files.append(tmp_rename_name) 3825 3826 # Number of fields 3827 nb_annotation_field = 0 3828 annotation_list = [] 3829 annotation_infos_rename_list = [] 3830 3831 for annotation_field in annotation_fields: 3832 3833 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3834 annotation_fields_new_name = annotation_fields.get( 3835 annotation_field, annotation_field 3836 ) 3837 if not annotation_fields_new_name: 3838 annotation_fields_new_name = annotation_field 3839 3840 # Check if field is in DB and if field is not elready in input data 3841 if ( 3842 annotation_field in db_hdr_vcf.get_header().infos 3843 and annotation_fields_new_name 3844 not in self.get_header().infos 3845 ): 3846 3847 log.info( 3848 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3849 ) 3850 3851 # BCFTools annotate param to rename fields 3852 if annotation_field != annotation_fields_new_name: 3853 annotation_infos_rename_list.append( 3854 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3855 ) 3856 3857 # Add INFO field to header 3858 db_hdr_vcf_header_infos_number = ( 3859 db_hdr_vcf_header_infos[annotation_field].num or "." 
3860 ) 3861 db_hdr_vcf_header_infos_type = ( 3862 db_hdr_vcf_header_infos[annotation_field].type 3863 or "String" 3864 ) 3865 db_hdr_vcf_header_infos_description = ( 3866 db_hdr_vcf_header_infos[annotation_field].desc 3867 or f"{annotation_field} description" 3868 ) 3869 db_hdr_vcf_header_infos_source = ( 3870 db_hdr_vcf_header_infos[annotation_field].source 3871 or "unknown" 3872 ) 3873 db_hdr_vcf_header_infos_version = ( 3874 db_hdr_vcf_header_infos[annotation_field].version 3875 or "unknown" 3876 ) 3877 3878 vcf_reader.infos[annotation_fields_new_name] = ( 3879 vcf.parser._Info( 3880 annotation_fields_new_name, 3881 db_hdr_vcf_header_infos_number, 3882 db_hdr_vcf_header_infos_type, 3883 db_hdr_vcf_header_infos_description, 3884 db_hdr_vcf_header_infos_source, 3885 db_hdr_vcf_header_infos_version, 3886 self.code_type_map[ 3887 db_hdr_vcf_header_infos_type 3888 ], 3889 ) 3890 ) 3891 3892 annotation_list.append(annotation_field) 3893 3894 nb_annotation_field += 1 3895 3896 else: 3897 3898 if ( 3899 annotation_field 3900 not in db_hdr_vcf.get_header().infos 3901 ): 3902 log.warning( 3903 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3904 ) 3905 if ( 3906 annotation_fields_new_name 3907 in self.get_header().infos 3908 ): 3909 log.warning( 3910 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3911 ) 3912 3913 log.info( 3914 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3915 ) 3916 3917 annotation_infos = ",".join(annotation_list) 3918 3919 if annotation_infos != "": 3920 3921 # Annotated VCF (and error file) 3922 tmp_annotation_vcf_name = os.path.join( 3923 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3924 ) 3925 tmp_annotation_vcf_name_err = ( 3926 tmp_annotation_vcf_name + ".err" 3927 ) 3928 3929 # Add fields to annotate 3930 if not annotation_fields_full: 3931 annotation_infos_option = f"-info {annotation_infos}" 3932 else: 
3933 annotation_infos_option = "" 3934 3935 # Info fields rename 3936 if annotation_infos_rename_list: 3937 annotation_infos_rename = " -c " + ",".join( 3938 annotation_infos_rename_list 3939 ) 3940 else: 3941 annotation_infos_rename = "" 3942 3943 # Annotate command 3944 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3945 3946 # Add command 3947 commands[command_annotate] = tmp_annotation_vcf_name 3948 3949 if commands: 3950 3951 # Export VCF file 3952 self.export_variant_vcf( 3953 vcf_file=tmp_vcf_name, 3954 remove_info=True, 3955 add_samples=False, 3956 index=True, 3957 ) 3958 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3959 3960 # Num command 3961 nb_command = 0 3962 3963 # Annotate 3964 for command_annotate in commands: 3965 nb_command += 1 3966 log.info( 3967 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3968 ) 3969 log.debug(f"command_annotate={command_annotate}") 3970 run_parallel_commands([command_annotate], threads) 3971 3972 # Debug 3973 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3974 3975 # Update variants 3976 log.info( 3977 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3978 ) 3979 self.update_from_vcf(commands[command_annotate])
This function annotates with SnpSift (output post-processed with bcftools).
Parameters
- threads: Number of threads to use
Returns
None (the variants table is updated in place).
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate variants using 'bcftools annotate' with tabix-indexed VCF/BED
        databases.

        Databases are taken from param section "annotation" -> "bcftools" ->
        "annotations". Per database and per chromosome, a BED of merged variant
        regions is built, a 'bcftools annotate' command is generated, all commands
        run in parallel, the per-chromosome outputs are merged with
        'bcftools merge', and the result is loaded back into the variants table
        with `update_from_vcf`.

        :param threads: Number of threads to use (defaults to self.get_threads())
        :return: None (the variants table is updated in place); returns early
            when the variants table is empty
        :raises ValueError: if the bcftools binary cannot be found, if a database
            is missing, not compressed or not indexed, or if any annotation
            command reported '[E::' errors
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG - keep temp files when verbosity is debug
        # NOTE(review): delete_tmp is computed and logged but not used anywhere
        # in this method — confirm whether cleanup was meant to honour it
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders (generic "annotations" plus
        # bcftools-specific folders, de-duplicated)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping of database path/name -> requested annotation fields
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config, then default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF - temp input file for all bcftools commands
        # NOTE(review): tmp_vcf_name is created with delete=False and is not in
        # tmp_files, so it is not covered by the "rm -f" cleanup below — TODO
        # confirm whether it should be
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug log only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Accumulators: annotated VCFs, commands, temp files, stderr files
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves the file within databases_folders)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzip)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix .tbi required)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database ("ALL"/"INFO" selects every INFO
                    # field declared in the database header)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name (falls back to the original name)
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Keep the field only if it exists in the database header
                        # and is not already present in the input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header (defaults for missing
                            # metadata in the database header)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools "-c" column spec: "NEW:=INFO/OLD" renames,
                            # plain field name keeps the original name
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and
                        # variants line): keep only "##" meta lines
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat < {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file (positional columns
                        # prefixed for BED-format databases)
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window around each variant,
                            # clamped at 0, then merged into non-overlapping spans
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files (per-chromosome annotated output + stderr)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate within the BED regions, then tabix
                            # the output so it can be merged
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (INFO stripped; re-annotated by commands)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands (commands run in
                # parallel, so per-command threads share the total budget)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated VCFs with the original
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (appended to the merge shell line)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: collect "[W::" warnings and "[E::" errors
                    # from all stderr files
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed - any "[E::" line from bcftools/tabix aborts
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates with bcftools.
Parameters
- threads: Number of threads to use
Returns
None.
    def annotation_exomiser(self, threads: int = None) -> None:
        """
        Annotate variants with Exomiser.

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
            Used if no full "analysis" is provided.
            Default: "exome"
        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            Default: None
        - "subject" (dict):
            Sample parameters (see Exomiser docs).
            Example:
                "subject":
                    {
                        "id": "ISDBM322017",
                        "sex": "FEMALE"
                    }
            Default: None
        - "sample" (string):
            Sample name to construct "subject" section:
                "subject":
                    {
                        "id": "<sample>",
                        "sex": "UNKNOWN_SEX"
                    }
            Default: None
        - "phenotypicFeatures" (dict):
            Phenotypic features to construct "subject" section.
            Example:
                "phenotypicFeatures":
                    [
                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
                    ]
        - "hpo" (list):
            List of HPO ids as phenotypic features.
            Example:
                "hpo": ['0001156', '0001363', '0011304', '0010055']
            Default: []
        - "outputOptions" (dict):
            Output options (see Exomiser docs).
            Default:
                "output_options" =
                    {
                        "outputContributingVariantsOnly": False,
                        "numGenes": 0,
                        "outputFormats": ["TSV_VARIANT", "VCF"]
                    }
        - "transcript_source" (string):
            Transcript source (either "refseq", "ucsc", "ensembl")
            Default: "refseq"
        - "exomiser_to_info" (boolean):
            Add exomiser TSV file columns as INFO fields in VCF.
            Default: False
        - "release" (string):
            Exomiser database release.
            If it does not exist, the database release will be downloaded (takes a while).
            Default: None (provided by application.properties configuration file)
        - "exomiser_application_properties" (file):
            Exomiser configuration file (see Exomiser docs).
            Useful to automatically download databases (especially for specific genome databases).

        Notes:
        - If no sample in parameters, the first sample in VCF will be chosen
        - If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off

        :param threads: The number of threads to use
        :return: True when annotation completed; False if the VCF is empty or has no samples
        """

        # DEBUG
        log.debug("Start annotation with Exomiser databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        # NOTE(review): a missing folder is only logged here, not raised;
        # databases_download_exomiser below is expected to create/populate it
        databases_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
        )
        databases_folders = full_path(databases_folders)
        if not os.path.exists(databases_folders):
            log.error(f"Databases annotations: {databases_folders} NOT found")
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - Exomiser binary (jar); fail early if not resolvable
        exomiser_bin_command = get_bin_command(
            bin="exomiser-cli*.jar",
            tool="exomiser",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
        )
        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
        if not exomiser_bin_command:
            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - Exomiser section
        param_exomiser = param.get("annotation", {}).get("exomiser", {})
        log.debug(f"Param Exomiser: {param_exomiser}")

        # Param - Assembly (param overrides config, falls back to default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
        log.debug("Assembly: " + str(assembly))

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return False

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Samples (Exomiser requires at least one genotyped sample)
        samples = self.get_header_sample_list()
        if not samples:
            log.error("No Samples in VCF")
            return False
        log.debug(f"Samples: {samples}")

        # Memory limit
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # Exomiser java options
        # NOTE(review): exomiser_java_options is built and logged but never
        # appended to the command below — confirm whether it should be used
        exomiser_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {exomiser_java_options}")

        # Download Exomiser databases (if not exists)
        exomiser_release = param_exomiser.get("release", None)
        exomiser_application_properties = param_exomiser.get(
            "exomiser_application_properties", None
        )
        databases_download_exomiser(
            assemblies=[assembly],
            exomiser_folder=databases_folders,
            exomiser_release=exomiser_release,
            exomiser_phenotype_release=exomiser_release,
            exomiser_application_properties=exomiser_application_properties,
        )

        # Force annotation (always True: annotation is re-run even if an
        # "Exomiser" INFO field already exists in the header)
        force_update_annotation = True

        if "Exomiser" not in self.get_header().infos or force_update_annotation:
            log.debug("Start annotation Exomiser")

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # tmp_dir = "/tmp/exomiser"

                ### ANALYSIS ###
                ################

                # Create analysis.json through analysis dict
                # either analysis in param or by default
                # depending on preset exome/genome)

                # Init analysis dict
                param_exomiser_analysis_dict = {}

                # analysis from param
                param_exomiser_analysis = param_exomiser.get("analysis", {})
                param_exomiser_analysis = full_path(param_exomiser_analysis)

                # If analysis in param -> load analysis json
                if param_exomiser_analysis:

                    # If param analysis is a file and exists
                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
                        param_exomiser_analysis
                    ):
                        # Load analysis file into analysis dict (either yaml or json;
                        # yaml.safe_load parses both since JSON is a YAML subset)
                        with open(param_exomiser_analysis) as json_file:
                            param_exomiser_analysis_dict = yaml.safe_load(json_file)

                    # If param analysis is a dict
                    elif isinstance(param_exomiser_analysis, dict):
                        # Load analysis dict into analysis dict (either yaml or json)
                        param_exomiser_analysis_dict = param_exomiser_analysis

                    # Error analysis type
                    else:
                        log.error(f"Analysis type unknown. Check param file.")
                        raise ValueError(f"Analysis type unknown. Check param file.")

                # Case no input analysis config file/dict
                # Use preset (exome/genome) to open default config file
                if not param_exomiser_analysis_dict:

                    # default preset
                    default_preset = "exome"

                    # Get param preset or default preset
                    param_exomiser_preset = param_exomiser.get("preset", default_preset)

                    # Try to find if preset is a file
                    if os.path.exists(param_exomiser_preset):
                        # Preset file is provided in full path
                        param_exomiser_analysis_default_config_file = (
                            param_exomiser_preset
                        )
                    # elif os.path.exists(full_path(param_exomiser_preset)):
                    #     # Preset file is provided in full path
                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
                    elif os.path.exists(
                        os.path.join(folder_config, param_exomiser_preset)
                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config, param_exomiser_preset
                        )
                    else:
                        # Construct preset file name from the preset keyword
                        param_exomiser_analysis_default_config_file = os.path.join(
                            folder_config,
                            f"preset-{param_exomiser_preset}-analysis.json",
                        )

                    # If preset file exists
                    param_exomiser_analysis_default_config_file = full_path(
                        param_exomiser_analysis_default_config_file
                    )
                    if os.path.exists(param_exomiser_analysis_default_config_file):
                        # Load preset file into analysis dict (either yaml or json)
                        with open(
                            param_exomiser_analysis_default_config_file
                        ) as json_file:
                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
                                json_file
                            )

                    # Error preset file
                    else:
                        log.error(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )
                        raise ValueError(
                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
                        )

                # If no analysis dict created
                if not param_exomiser_analysis_dict:
                    log.error(f"No analysis config")
                    raise ValueError(f"No analysis config")

                # Log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### PHENOPACKET ###
                ###################

                # If no PhenoPacket in analysis dict -> check in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # If PhenoPacket in param -> load analysis json
                    if param_exomiser.get("phenopacket", None):

                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
                        param_exomiser_phenopacket = full_path(
                            param_exomiser_phenopacket
                        )

                        # If param phenopacket is a file and exists
                        if isinstance(
                            param_exomiser_phenopacket, str
                        ) and os.path.exists(param_exomiser_phenopacket):
                            # Load phenopacket file into analysis dict (either yaml or json)
                            with open(param_exomiser_phenopacket) as json_file:
                                param_exomiser_analysis_dict["phenopacket"] = (
                                    yaml.safe_load(json_file)
                                )

                        # If param phenopacket is a dict
                        elif isinstance(param_exomiser_phenopacket, dict):
                            # Load phenopacket dict into analysis dict (either yaml or json)
                            param_exomiser_analysis_dict["phenopacket"] = (
                                param_exomiser_phenopacket
                            )

                        # Error phenopacket type
                        else:
                            log.error(f"Phenopacket type unknown. Check param file.")
                            raise ValueError(
                                f"Phenopacket type unknown. Check param file."
                            )

                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
                if "phenopacket" not in param_exomiser_analysis_dict:

                    # Init PhenoPacket
                    param_exomiser_analysis_dict["phenopacket"] = {
                        "id": "analysis",
                        "proband": {},
                    }

                    ### Add subject ###

                    # If subject exists
                    param_exomiser_subject = param_exomiser.get("subject", {})

                    # If subject not exists -> find sample ID
                    if not param_exomiser_subject:

                        # Found sample ID in param
                        sample = param_exomiser.get("sample", None)

                        # Find sample ID (first sample)
                        if not sample:
                            sample_list = self.get_header_sample_list()
                            if len(sample_list) > 0:
                                sample = sample_list[0]
                            else:
                                log.error(f"No sample found")
                                raise ValueError(f"No sample found")

                        # Create subject with unknown sex
                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "subject"
                    ] = param_exomiser_subject

                    ### Add "phenotypicFeatures" ###

                    # If phenotypicFeatures exists
                    param_exomiser_phenotypicfeatures = param_exomiser.get(
                        "phenotypicFeatures", []
                    )

                    # If phenotypicFeatures not exists -> Try to infer from hpo list
                    if not param_exomiser_phenotypicfeatures:

                        # Found HPO in param
                        param_exomiser_hpo = param_exomiser.get("hpo", [])

                        # Split HPO if list in string format separated by comma
                        if isinstance(param_exomiser_hpo, str):
                            param_exomiser_hpo = param_exomiser_hpo.split(",")

                        # Create HPO list (keep only digits, e.g. "HP:0001156" -> "0001156")
                        for hpo in param_exomiser_hpo:
                            hpo_clean = re.sub("[^0-9]", "", hpo)
                            param_exomiser_phenotypicfeatures.append(
                                {
                                    "type": {
                                        "id": f"HP:{hpo_clean}",
                                        "label": f"HP:{hpo_clean}",
                                    }
                                }
                            )

                    # Add to dict
                    param_exomiser_analysis_dict["phenopacket"][
                        "phenotypicFeatures"
                    ] = param_exomiser_phenotypicfeatures

                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
                    # (this prioritiser needs phenotype input to be meaningful)
                    if not param_exomiser_phenotypicfeatures:
                        for step in param_exomiser_analysis_dict.get(
                            "analysis", {}
                        ).get("steps", []):
                            if "hiPhivePrioritiser" in step:
                                param_exomiser_analysis_dict.get("analysis", {}).get(
                                    "steps", []
                                ).remove(step)

                ### Add Input File ###

                # Initial file name and htsFiles
                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
                    {
                        "uri": tmp_vcf_name,
                        "htsFormat": "VCF",
                        "genomeAssembly": assembly,
                    }
                ]

                ### Add metaData ###

                # If metaData not in analysis dict
                # NOTE(review): the check looks at the top-level dict but the
                # value is written under "phenopacket" — confirm intent
                if "metaData" not in param_exomiser_analysis_dict:
                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
                        "createdBy": "howard",
                        "phenopacketSchemaVersion": 1,
                    }

                ### OutputOptions ###

                # Init output result folder
                output_results = os.path.join(tmp_dir, "results")

                # If no outputOptions in analysis dict
                if "outputOptions" not in param_exomiser_analysis_dict:

                    # default output formats
                    defaut_output_formats = ["TSV_VARIANT", "VCF"]

                    # Get outputOptions in param
                    output_options = param_exomiser.get("outputOptions", None)

                    # If no output_options in param -> use defaults
                    if not output_options:
                        output_options = {
                            "outputContributingVariantsOnly": False,
                            "numGenes": 0,
                            "outputFormats": defaut_output_formats,
                        }

                    # Replace outputDirectory in output options
                    output_options["outputDirectory"] = output_results
                    output_options["outputFileName"] = "howard"

                    # Add outputOptions in analysis dict
                    param_exomiser_analysis_dict["outputOptions"] = output_options

                else:

                    # Replace output_results and output format (if exists in param);
                    # TSV_VARIANT and VCF are always forced so results can be parsed below
                    param_exomiser_analysis_dict["outputOptions"][
                        "outputDirectory"
                    ] = output_results
                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
                        list(
                            set(
                                param_exomiser_analysis_dict.get(
                                    "outputOptions", {}
                                ).get("outputFormats", [])
                                + ["TSV_VARIANT", "VCF"]
                            )
                        )
                    )

                # log
                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")

                ### ANALYSIS FILE ###
                #####################

                ### Full JSON analysis config file ###

                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
                with open(exomiser_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict, fp, indent=4)

                ### SPLIT analysis and sample config files

                # Split analysis dict
                # NOTE(review): .copy() is shallow — the split dict shares nested
                # values with param_exomiser_analysis_dict (only top-level pop below)
                param_exomiser_analysis_dict_for_split = (
                    param_exomiser_analysis_dict.copy()
                )

                # Phenopacket JSON file
                exomiser_analysis_phenopacket = os.path.join(
                    tmp_dir, "analysis_phenopacket.json"
                )
                with open(exomiser_analysis_phenopacket, "w") as fp:
                    json.dump(
                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
                        fp,
                        indent=4,
                    )

                # Analysis JSON file without Phenopacket parameters
                param_exomiser_analysis_dict_for_split.pop("phenopacket")
                exomiser_analysis_analysis = os.path.join(
                    tmp_dir, "analysis_analysis.json"
                )
                with open(exomiser_analysis_analysis, "w") as fp:
                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)

                ### INITIAL VCF file ###
                #######################

                ### Create list of samples to use and include into initial VCF file ####

                # Subject (main sample)
                # Get sample ID in analysis dict
                sample_subject = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample_proband = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("proband", {})
                    .get("subject", {})
                    .get("id", None)
                )
                sample = []
                if sample_subject:
                    sample.append(sample_subject)
                if sample_proband:
                    sample.append(sample_proband)

                # Get sample ID within Pedigree
                pedigree_persons_list = (
                    param_exomiser_analysis_dict.get("phenopacket", {})
                    .get("pedigree", {})
                    .get("persons", {})
                )

                # Create list with all sample ID in pedigree (if exists)
                pedigree_persons = []
                for person in pedigree_persons_list:
                    pedigree_persons.append(person.get("individualId"))

                # Concat subject sample ID and samples ID in pedigree samples
                samples = list(set(sample + pedigree_persons))

                # Check if sample list is not empty
                if not samples:
                    log.error(f"No samples found")
                    raise ValueError(f"No samples found")

                # Create VCF with sample (either sample in param or first one by default)
                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=True,
                    list_samples=samples,
                    index=False,
                )

                ### Execute Exomiser ###
                ########################

                # Init command
                # NOTE(review): exomiser_command is never used below — dead variable
                exomiser_command = ""

                # Command exomiser options
                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "

                # Release
                exomiser_release = param_exomiser.get("release", None)
                if exomiser_release:
                    # phenotype data version
                    exomiser_options += (
                        f" --exomiser.phenotype.data-version={exomiser_release} "
                    )
                    # data version
                    exomiser_options += (
                        f" --exomiser.{assembly}.data-version={exomiser_release} "
                    )
                    # variant white list
                    variant_white_list_file = (
                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
                    )
                    if os.path.exists(
                        os.path.join(
                            databases_folders, assembly, variant_white_list_file
                        )
                    ):
                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "

                # transcript_source
                transcript_source = param_exomiser.get(
                    "transcript_source", None
                )  # ucsc, refseq, ensembl
                if transcript_source:
                    exomiser_options += (
                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
                    )

                # If analysis contains proband param -> use split analysis + phenopacket files
                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
                    "proband", {}
                ):
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "

                # If no proband (usually uniq sample) -> use the full analysis file
                else:
                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"

                # Log
                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")

                # Run command (non-zero exit code -> failure)
                result = subprocess.call(
                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
                )
                if result:
                    log.error("Exomiser command failed")
                    raise ValueError("Exomiser command failed")

                ### RESULTS ###
                ###############

                ### Annotate with TSV fields ###

                # Option: explode TSV columns into INFO fields
                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)

                # Init result tsv file
                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")

                # Parse TSV file and explode columns in INFO field
                if exomiser_to_info and os.path.exists(output_results_tsv):

                    # Log
                    log.debug("Exomiser columns to VCF INFO field")

                    # Retrieve columns and types (LIMIT 0 -> schema only, no rows)
                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
                    output_results_tsv_df = self.get_query_to_df(query)
                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()

                    # Init concat fields for update
                    sql_query_update_concat_fields = []

                    # Fields to avoid (variant coordinates, already in the table)
                    fields_to_avoid = [
                        "CONTIG",
                        "START",
                        "END",
                        "REF",
                        "ALT",
                        "QUAL",
                        "FILTER",
                        "GENOTYPE",
                    ]

                    # List all columns to add into header
                    for header_column in output_results_tsv_columns:

                        # If header column is enabled
                        if header_column not in fields_to_avoid:

                            # Header info type: object dtype fully coercible to
                            # numeric -> Float; non-object dtype -> Integer;
                            # otherwise keep default String
                            header_info_type = "String"
                            header_column_df = output_results_tsv_df[header_column]
                            header_column_df_dtype = header_column_df.dtype
                            if header_column_df_dtype == object:
                                if (
                                    pd.to_numeric(header_column_df, errors="coerce")
                                    .notnull()
                                    .all()
                                ):
                                    header_info_type = "Float"
                            else:
                                header_info_type = "Integer"

                            # Header info: sanitize column name ('-' -> '_', drop '#')
                            characters_to_validate = ["-"]
                            pattern = "[" + "".join(characters_to_validate) + "]"
                            header_info_name = re.sub(
                                pattern,
                                "_",
                                f"Exomiser_{header_column}".replace("#", ""),
                            )
                            header_info_number = "."
                            header_info_description = (
                                f"Exomiser {header_column} annotation"
                            )
                            header_info_source = "Exomiser"
                            header_info_version = "unknown"
                            header_info_code = CODE_TYPE_MAP[header_info_type]
                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
                                header_info_name,
                                header_info_number,
                                header_info_type,
                                header_info_description,
                                header_info_source,
                                header_info_version,
                                header_info_code,
                            )

                            # Add field to add for update to concat fields
                            sql_query_update_concat_fields.append(
                                f"""
                                CASE
                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
                                    THEN concat(
                                        '{header_info_name}=',
                                        table_parquet."{header_column}",
                                        ';'
                                    )

                                    ELSE ''
                                END
                                """
                            )

                    # Update query: append the concatenated Exomiser fields to INFO,
                    # joining TSV rows to variants on CHROM/POS/REF/ALT
                    # ('chr' prefix added to CONTIG to match #CHROM naming)
                    sql_query_update = f"""
                        UPDATE {table_variants} as table_variants
                        SET INFO = concat(
                                        CASE
                                            WHEN INFO NOT IN ('', '.')
                                            THEN INFO
                                            ELSE ''
                                        END,
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END,
                                        (
                                        SELECT
                                            concat(
                                                {",".join(sql_query_update_concat_fields)}
                                            )
                                        FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
                                        WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
                                            AND table_parquet.\"START\" = table_variants.\"POS\"
                                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        )
                                    )
                        ;
                        """

                    # Update
                    self.conn.execute(sql_query_update)

                ### Annotate with VCF INFO field ###

                # Init result VCF file
                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")

                # If VCF exists
                if os.path.exists(output_results_vcf):

                    # Log
                    log.debug("Exomiser result VCF update variants")

                    # Find Exomiser INFO field annotation in header
                    with gzip.open(output_results_vcf, "rt") as f:
                        header_list = self.read_vcf_header(f)
                    exomiser_vcf_header = vcf.Reader(
                        io.StringIO("\n".join(header_list))
                    )

                    # Add annotation INFO field to header
                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]

                    # Update variants with VCF
                    self.update_from_vcf(output_results_vcf)

        return True
This function annotates with Exomiser.
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default : None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string):
Sample name to construct "subject" section:
"subject":
{
"id": "<sample>",
"sex": "UNKNOWN_SEX"
}
Default: None
- "phenotypicFeatures" (dict): Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO is found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
    def annotation_snpeff(self, threads: int = None) -> None:
        """
        Annotate variants with snpEff.

        Exports the variants to a temporary VCF, runs the snpEff jar on it
        (downloading the snpEff database for the configured assembly if needed),
        merges the resulting INFO annotations back into the variants table, and
        registers any new INFO fields in the in-memory VCF header.

        :param threads: The number of threads to use
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with snpeff databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        # NOTE(review): delete_tmp is computed but never used in this method
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - snpEff bin command; fail early if the jar is not resolvable
        snpeff_bin_command = get_bin_command(
            bin="snpEff.jar",
            tool="snpeff",
            bin_type="jar",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
        )
        if not snpeff_bin_command:
            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - snpEff databases (create folder if missing)
        snpeff_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
        )
        snpeff_databases = full_path(snpeff_databases)
        if snpeff_databases is not None and snpeff_databases != "":
            log.debug(f"Create snpEff databases folder")
            if not os.path.exists(snpeff_databases):
                os.makedirs(snpeff_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param
        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
        log.debug("Options: " + str(options))

        # Param - Assembly (param overrides config, falls back to default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Param - Options (stats/csvStats paths; "OUTPUT" placeholder is
        # replaced by the output file path)
        snpeff_options = (
            param.get("annotation", {}).get("snpeff", {}).get("options", "")
        )
        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
        snpeff_csvstats = (
            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
        )
        if snpeff_stats:
            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
            snpeff_stats = full_path(snpeff_stats)
            snpeff_options += f" -stats {snpeff_stats}"
        if snpeff_csvstats:
            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
            snpeff_csvstats = full_path(snpeff_csvstats)
            snpeff_options += f" -csvStats {snpeff_csvstats}"

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=True,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Memory limit
        # if config.get("memory", None):
        #     memory_limit = config.get("memory", "8G")
        # else:
        #     memory_limit = "8G"
        memory_limit = self.get_memory("8G")
        log.debug(f"memory_limit: {memory_limit}")

        # snpEff java options
        # NOTE(review): snpeff_java_options is built and logged but never
        # appended to the command below, and the log message text says
        # "Exomiser" (copy/paste) — confirm intent
        snpeff_java_options = (
            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
        )
        log.debug(f"Exomiser java options: {snpeff_java_options}")

        # Force annotation (always True: re-run even if "ANN" already in header)
        force_update_annotation = True

        if "ANN" not in self.get_header().infos or force_update_annotation:

            # Check snpEff database (download for this assembly if missing)
            log.debug(f"Check snpEff databases {[assembly]}")
            databases_download_snpeff(
                folder=snpeff_databases, assemblies=[assembly], config=config
            )

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=True,
                add_samples=False,
                index=True,
            )

            # Tmp file
            err_files = []
            tmp_annotate_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf",
                delete=False,
            )
            tmp_annotate_vcf_name = tmp_annotate_vcf.name
            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
            err_files.append(tmp_annotate_vcf_name_err)

            # Command: snpEff writes annotated VCF to stdout, errors to .err file
            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
            log.debug(f"Annotation - snpEff command: {snpeff_command}")
            run_parallel_commands([snpeff_command], 1)

            # Error messages: scan stderr files for htslib-style warnings ([W::])
            # and errors ([E::])
            log.info(f"Error/Warning messages:")
            error_message_command_all = []
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f" {message}")
            # debug info
            for message in list(set(error_message_command_all)):
                log.debug(f" {message}")
            # failed
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header and merge new INFO fields into ours
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            # Unreachable while force_update_annotation is hard-coded True above
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates the loaded variants with snpEff.
Parameters
- threads: The number of threads to use (defaults to the configured thread count).
Returns
None; the variants table is updated in place with the snpEff annotations.
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with Annovar (table_annovar.pl).

        The variants table is exported to a temporary bgzipped VCF, each
        configured Annovar database is applied to it (one table_annovar.pl
        run per database, post-processed through a sed/awk/bcftools pipeline
        to clean and rename INFO fields), per-database results are merged
        with `bcftools merge`, and the merged annotations are loaded back
        into the variants table via `update_from_vcf`.

        :param threads: number of threads to use (defaults to self.get_threads())
        :raises ValueError: if the annovar/bcftools binaries or the Annovar
            databases folder cannot be resolved, or if an external command
            writes error lines to stderr
        :return: None; the variants table and in-memory VCF header are
            updated in place
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Accumulators for temporary files and stderr capture files
        tmp_files = []
        err_files = []

        # Keep temporary files/folders when running in debug verbosity
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper resolved by helper)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (first entry is used if a list;
        # created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl command-line options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (database name -> {field: new_name} mapping)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Log existing annotations already present in the VCF header
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Annotation is always (re-)applied, even for fields already in header
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is never used in this method
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF - temporary input file shared by all databases
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO set to ".", no samples, indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (bcftools --rename-annots input)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database availability (download if missing)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Empty mapping means "take all INFO fields"
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp working directory for annovar output
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl writes <prefix>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file: final vcf annotated by annovar (after cleaning pipe)
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to keep (and their renamed counterparts)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" pair per line)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol: annovar database name
                protocol = annotation

                # argument (per-protocol extra args)
                argument = ""

                # operation: f=filter, g=gene, r=region (annovar semantics)
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Collect warning/error messages from stderr capture files
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info: warnings and errors (deduplicated)
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info: everything (deduplicated)
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine the original VCF with all annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation fields added by Annovar in the merged header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Merge new INFO fields into the in-memory header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): delete_tmp is computed above but the cleanup runs
        # unconditionally ("if True") — confirm whether tmp files should be
        # kept in debug mode as the delete_tmp flag suggests.
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.
Parameters
- threads: The number of threads to use (defaults to the configured thread count).
Returns
None; the variants table is updated in place with the Annovar annotations.
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with Parquet/DuckDB annotation databases.

        For each configured annotation database, the database file (and its
        ".hdr" header file) is located in the configured folders, its INFO
        fields are mapped against the variants table, and one UPDATE query
        per chromosome is executed in DuckDB to concatenate the selected
        fields into the INFO column. Supports the 'ALL'/'INFO' pseudo-fields
        (annotate every field in the database), "regions" databases
        (position-interval joins), and the update/append options
        (annotations_update/annotations_append).

        :param threads: number of threads to use for the annotation
            (defaults to self.get_threads())
        :raises ValueError: if an annotation database or its header file
            cannot be found
        :return: None; the variants table and in-memory VCF header are
            updated in place
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Keep temporary files/folders when running in debug verbosity
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config: search both "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: database -> {field: new_name} mapping
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation: replace existing INFO fields (update) or
        # only fill empty values (append)
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS (total, for the final summary log)
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Log existing annotations already present in the VCF header
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns
        # NOTE(review): never populated in this method, so the drop loop at
        # the end is a no-op as written.
        added_columns = []

        # drop indexes (they would slow down/conflict with the UPDATEs)
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-database: scan folders and add every database found
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # "ALL" itself is a marker, not a database
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields (empty mapping means "take all INFO fields")
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object (resolves file, header, type)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists: both database and header are required
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion (ATTACH if the database needs it)
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos (columns beyond the standard VCF ones)
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Synthesize a generic String INFO definition
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database ('ALL'/'INFO' pseudo-fields)
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use (SQL CASE fragments)
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (for regions databases)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (INFO field -> database column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch Anotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate: field exists in DB header and is either
                        # new in the VCF header or forced by update/append
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO before re-annotating
                                query = f"""
                                    UPDATE {table_variants} as table_variants
                                    SET INFO = REGEXP_REPLACE(
                                                concat(table_variants.INFO,''),
                                                ';*{annotation_fields_new_name}=[^;]*',
                                                ''
                                                )
                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                    """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO (";" from 2nd on)
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header (fall back to defaults)
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append: only fill values that are empty or "."
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                        ELSE ''
                                END
                            """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                        ELSE ''
                                END
                            """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        # Fast path: copy the database's whole INFO column
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init: start from the field-removal queries (update mode)
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        query_dict = query_dict_remove

                        # One UPDATE query per chromosome
                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database: interval
                            # overlap join on POS/START/END, aggregated per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from."#CHROM" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database: exact match
                            # on CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query: append new annotations to
                            # INFO, inserting ";" only when INFO is non-empty
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                    SET INFO =
                                        concat(
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                THEN table_variants.INFO
                                                ELSE ''
                                            END
                                            ,
                                            CASE WHEN table_variants.INFO NOT IN ('','.')
                                                    AND (
                                                        concat({sql_query_annotation_update_info_sets_sql})
                                                        )
                                                        NOT IN ('','.')
                                                THEN ';'
                                                ELSE ''
                                            END
                                            ,
                                            {sql_query_annotation_update_info_sets_sql}
                                        )
                                    {sql_query_annotation_from_clause}
                                    WHERE {sql_query_annotation_where_clause}
                                    ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x (queries can nest deeply)
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # DuckDB UPDATE reports affected rows in "Count"
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
It takes a VCF file and annotates it with a Parquet file.
Parameters
- threads: number of threads to use for the annotation
Returns
the value of the variable "result".
6430 def annotation_splice(self, threads: int = None) -> None: 6431 """ 6432 This function annotate with snpEff 6433 6434 :param threads: The number of threads to use 6435 :return: the value of the variable "return_value". 6436 """ 6437 6438 # DEBUG 6439 log.debug("Start annotation with splice tools") 6440 6441 # Threads 6442 if not threads: 6443 threads = self.get_threads() 6444 log.debug("Threads: " + str(threads)) 6445 6446 # DEBUG 6447 delete_tmp = True 6448 if self.get_config().get("verbosity", "warning") in ["debug"]: 6449 delete_tmp = False 6450 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6451 6452 # Config 6453 config = self.get_config() 6454 log.debug("Config: " + str(config)) 6455 splice_config = config.get("tools", {}).get("splice", {}) 6456 if not splice_config: 6457 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6458 msg_err = "No Splice tool config" 6459 raise ValueError(msg_err) 6460 log.debug(f"splice_config: {splice_config}") 6461 6462 # Config - Folders - Databases 6463 databases_folders = ( 6464 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6465 ) 6466 log.debug("Databases annotations: " + str(databases_folders)) 6467 6468 # Splice docker image 6469 splice_docker_image = splice_config.get("docker").get("image") 6470 6471 # Pull splice image if it's not already there 6472 if not check_docker_image_exists(splice_docker_image): 6473 log.warning( 6474 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6475 ) 6476 try: 6477 command(f"docker pull {splice_config.get('docker').get('image')}") 6478 except subprocess.CalledProcessError: 6479 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6480 log.error(msg_err) 6481 raise ValueError(msg_err) 6482 6483 # Config - splice databases 6484 splice_databases = ( 6485 config.get("folders", {}) 6486 .get("databases", {}) 6487 .get("splice", DEFAULT_SPLICE_FOLDER) 6488 ) 6489 splice_databases = 
full_path(splice_databases) 6490 6491 # Param 6492 param = self.get_param() 6493 log.debug("Param: " + str(param)) 6494 6495 # Param 6496 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6497 log.debug("Options: " + str(options)) 6498 6499 # Data 6500 table_variants = self.get_table_variants() 6501 6502 # Check if not empty 6503 log.debug("Check if not empty") 6504 sql_query_chromosomes = ( 6505 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6506 ) 6507 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6508 log.info("VCF empty") 6509 return None 6510 6511 # Export in VCF 6512 log.debug("Create initial file to annotate") 6513 6514 # Create output folder / work folder 6515 if options.get("output_folder", ""): 6516 output_folder = options.get("output_folder", "") 6517 if not os.path.exists(output_folder): 6518 Path(output_folder).mkdir(parents=True, exist_ok=True) 6519 else: 6520 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6521 if not os.path.exists(output_folder): 6522 Path(output_folder).mkdir(parents=True, exist_ok=True) 6523 6524 if options.get("workdir", ""): 6525 workdir = options.get("workdir", "") 6526 else: 6527 workdir = "/work" 6528 6529 # Create tmp VCF file 6530 tmp_vcf = NamedTemporaryFile( 6531 prefix=self.get_prefix(), 6532 dir=output_folder, 6533 suffix=".vcf", 6534 delete=False, 6535 ) 6536 tmp_vcf_name = tmp_vcf.name 6537 6538 # VCF header 6539 header = self.get_header() 6540 6541 # Existing annotations 6542 for vcf_annotation in self.get_header().infos: 6543 6544 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6545 log.debug( 6546 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6547 ) 6548 6549 # Memory limit 6550 if config.get("memory", None): 6551 memory_limit = config.get("memory", "8G").upper() 6552 # upper() 6553 else: 6554 memory_limit = "8G" 6555 log.debug(f"memory_limit: {memory_limit}") 6556 6557 # 
Check number of variants to annotate 6558 where_clause_regex_spliceai = r"SpliceAI_\w+" 6559 where_clause_regex_spip = r"SPiP_\w+" 6560 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6561 df_list_of_variants_to_annotate = self.get_query_to_df( 6562 query=f""" SELECT * FROM variants {where_clause} """ 6563 ) 6564 if len(df_list_of_variants_to_annotate) == 0: 6565 log.warning( 6566 f"No variants to annotate with splice. Variants probably already annotated with splice" 6567 ) 6568 return None 6569 else: 6570 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6571 6572 # Export VCF file 6573 self.export_variant_vcf( 6574 vcf_file=tmp_vcf_name, 6575 remove_info=True, 6576 add_samples=True, 6577 index=False, 6578 where_clause=where_clause, 6579 ) 6580 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6581 if any(value for value in splice_config.values() if value is None): 6582 log.warning("At least one splice config parameter is empty") 6583 # exit annotation_splice 6584 return None 6585 6586 # Params in splice nf 6587 def check_values(dico: dict): 6588 """ 6589 Ensure parameters for NF splice pipeline 6590 """ 6591 for key, val in dico.items(): 6592 if key == "genome": 6593 if any( 6594 assemb in options.get("genome", {}) 6595 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6596 ): 6597 yield f"--{key} hg19" 6598 elif any( 6599 assemb in options.get("genome", {}) 6600 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6601 ): 6602 yield f"--{key} hg38" 6603 elif ( 6604 (isinstance(val, str) and val) 6605 or isinstance(val, int) 6606 or isinstance(val, bool) 6607 ): 6608 yield f"--{key} {val}" 6609 6610 # Genome 6611 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6612 options["genome"] = genome 6613 # NF params 6614 nf_params = [] 6615 # Add options 6616 if options: 6617 log.debug(options) 6618 nf_params 
= list(check_values(options)) 6619 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6620 else: 6621 log.debug("No NF params provided") 6622 # Add threads 6623 if "threads" not in options.keys(): 6624 nf_params.append(f"--threads {threads}") 6625 # Genome path 6626 genome_path = find_genome( 6627 config.get("folders", {}) 6628 .get("databases", {}) 6629 .get("genomes", DEFAULT_GENOME_FOLDER), 6630 file=f"{genome}.fa", 6631 ) 6632 # Add genome path 6633 if not genome_path: 6634 raise ValueError( 6635 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6636 ) 6637 else: 6638 log.debug(f"Genome: {genome_path}") 6639 nf_params.append(f"--genome_path {genome_path}") 6640 6641 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6642 """ 6643 Setting up updated databases for SPiP and SpliceAI 6644 """ 6645 6646 try: 6647 6648 # SpliceAI assembly transcriptome 6649 spliceai_assembly = os.path.join( 6650 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6651 options.get("genome"), 6652 "transcriptome", 6653 ) 6654 spip_assembly = options.get("genome") 6655 6656 spip = find( 6657 f"transcriptome_{spip_assembly}.RData", 6658 config.get("folders", {}).get("databases", {}).get("spip", {}), 6659 ) 6660 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6661 log.debug(f"SPiP annotations: {spip}") 6662 log.debug(f"SpliceAI annotations: {spliceai}") 6663 if spip and spliceai: 6664 return [ 6665 f"--spip_transcriptome {spip}", 6666 f"--spliceai_transcriptome {spliceai}", 6667 ] 6668 else: 6669 log.warning( 6670 "Can't find splice databases in configuration, use annotations file from image" 6671 ) 6672 except TypeError: 6673 log.warning( 6674 "Can't find splice databases in configuration, use annotations file from image" 6675 ) 6676 return [] 6677 6678 # Add options, check if transcriptome option have already beend provided 6679 if ( 6680 
"spip_transcriptome" not in nf_params 6681 and "spliceai_transcriptome" not in nf_params 6682 ): 6683 splice_reference = splice_annotations(options, config) 6684 if splice_reference: 6685 nf_params.extend(splice_reference) 6686 # nf_params.append(f"--output_folder {output_folder}") 6687 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6688 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6689 log.debug(cmd) 6690 splice_config["docker"]["command"] = cmd 6691 6692 # Ensure proxy is set 6693 proxy = [ 6694 f"-e {var}={os.getenv(var)}" 6695 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6696 if os.getenv(var) is not None 6697 ] 6698 docker_cmd = get_bin_command( 6699 tool="splice", 6700 bin_type="docker", 6701 config=config, 6702 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6703 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6704 ) 6705 # print(docker_cmd) 6706 # exit() 6707 # Docker debug 6708 # if splice_config.get("rm_container"): 6709 # rm_container = "--rm" 6710 # else: 6711 # rm_container = "" 6712 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6713 log.debug(docker_cmd) 6714 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6715 log.debug(res.stdout) 6716 if res.stderr: 6717 log.error(res.stderr) 6718 res.check_returncode() 6719 # Update variants 6720 log.info("Annotation - Updating...") 6721 # Test find output vcf 6722 log.debug( 6723 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6724 ) 6725 output_vcf = [] 6726 # Wrong folder to look in 6727 for 
files in os.listdir(os.path.dirname(tmp_vcf_name)): 6728 if ( 6729 files 6730 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6731 ): 6732 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6733 # log.debug(os.listdir(options.get("output_folder"))) 6734 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6735 if not output_vcf: 6736 log.debug( 6737 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6738 ) 6739 else: 6740 # Get new header from annotated vcf 6741 log.debug(f"Initial header: {len(header.infos)} fields") 6742 # Create new header with splice infos 6743 new_vcf = Variants(input=output_vcf[0]) 6744 new_vcf_header = new_vcf.get_header().infos 6745 for keys, infos in new_vcf_header.items(): 6746 if keys not in header.infos.keys(): 6747 header.infos[keys] = infos 6748 log.debug(f"New header: {len(header.infos)} fields") 6749 log.debug(f"Splice tmp output: {output_vcf[0]}") 6750 self.update_from_vcf(output_vcf[0]) 6751 6752 # Remove file 6753 remove_if_exists(output_vcf)
This function annotates variants with splice tools (SPiP and SpliceAI).
Parameters
- threads: The number of threads to use
Returns
None.
    def get_config_default(self, name: str) -> dict:
        """
        The function `get_config_default` returns a dictionary containing default configurations for
        various calculations and prioritizations.

        :param name: The `name` parameter is used to specify which specific configuration to
        retrieve from the internal `config_default` dictionary (e.g. "calculations" or
        "prioritizations")
        :type name: str
        :return: The dictionary of default configuration settings registered under `name`. If the
        `name` parameter does not match a key in the `config_default` dictionary, None is returned.
        """

        # Registry of built-in configurations, keyed by configuration name.
        # Each "calculations" entry is either an SQL operation ("type": "sql",
        # with an "operation_query") or a Python operation ("type": "python",
        # with a "function_name"/"function_params" pair).
        config_default = {
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "table": "variants",
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                    """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                # NOTE(review): the descriptions of 'snpeff_ann_explode' and
                # 'snpeff_ann_explode_uniquify' look swapped — the first one
                # mentions "uniquify" but passes False, and vice versa. Confirm.
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "RENAME_INFO_FIELDS": {
                    "type": "python",
                    "name": "RENAME_INFO_FIELDS",
                    "description": "Rename or remove INFO/tags",
                    "available": True,
                    "function_name": "calculation_rename_info_fields",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
                "transcripts_export": {
                    "type": "python",
                    "name": "transcripts_export",
                    "description": "Export transcripts table/view as a file (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_export",
                    "function_params": [],
                },
            },
            # Default prioritization profile scoring snpEff impact levels
            # found in the ANN2 field
            "prioritizations": {
                "default": {
                    "ANN2": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        # Unknown names return None (not an empty dict)
        return config_default.get(name, None)
The function `get_config_default` returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The `name` parameter is used to specify which specific configuration to retrieve from the
dictionary of default configurations for different calculations and prioritizations.
Returns
The function `get_config_default` returns a dictionary containing default configuration settings
for different calculations and prioritizations. The specific configuration settings are retrieved
based on the input `name` parameter provided to the function. If the `name` parameter matches a key
in the `config_default` dictionary, the corresponding configuration settings are returned;
otherwise, None is returned.
7025 def get_config_json( 7026 self, name: str, config_dict: dict = {}, config_file: str = None 7027 ) -> dict: 7028 """ 7029 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 7030 default values, a dictionary, and a file. 7031 7032 :param name: The `name` parameter in the `get_config_json` function is a string that represents 7033 the name of the configuration. It is used to identify and retrieve the configuration settings 7034 for a specific component or module 7035 :type name: str 7036 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 7037 dictionary that allows you to provide additional configuration settings or overrides. When you 7038 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 7039 the key is the configuration setting you want to override or 7040 :type config_dict: dict 7041 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 7042 specify the path to a configuration file that contains additional settings. If provided, the 7043 function will read the contents of this file and update the configuration dictionary with the 7044 values found in the file, overriding any existing values with the 7045 :type config_file: str 7046 :return: The function `get_config_json` returns a dictionary containing the configuration 7047 settings. 
7048 """ 7049 7050 # Create with default prioritizations 7051 config_default = self.get_config_default(name=name) 7052 configuration = config_default 7053 # log.debug(f"configuration={configuration}") 7054 7055 # Replace prioritizations from dict 7056 for config in config_dict: 7057 configuration[config] = config_dict[config] 7058 7059 # Replace prioritizations from file 7060 config_file = full_path(config_file) 7061 if config_file: 7062 if os.path.exists(config_file): 7063 with open(config_file) as config_file_content: 7064 config_file_dict = yaml.safe_load(config_file_content) 7065 for config in config_file_dict: 7066 configuration[config] = config_file_dict[config] 7067 else: 7068 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 7069 log.error(msg_error) 7070 raise ValueError(msg_error) 7071 7072 return configuration
The function `get_config_json` retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The `name` parameter in the `get_config_json` function is a string that represents the
name of the configuration. It is used to identify and retrieve the configuration settings for a
specific component or module.
- config_dict: The `config_dict` parameter in the `get_config_json` function is a dictionary that
allows you to provide additional configuration settings or overrides. When you call the
`get_config_json` function, you can pass a dictionary containing key-value pairs where the key is
the configuration setting you want to override.
- config_file: The `config_file` parameter in the `get_config_json` function is used to specify
the path to a configuration file that contains additional settings. If provided, the function will
read the contents of this file and update the configuration dictionary with the values found in
the file, overriding any existing values.
Returns
The function `get_config_json` returns a dictionary containing the configuration settings.
7074 def prioritization( 7075 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7076 ) -> bool: 7077 """ 7078 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7079 prioritizes variants based on configured profiles and criteria. 7080 7081 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7082 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7083 a table name is provided, the method will prioritize the variants in that specific table 7084 :type table: str 7085 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7086 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7087 provided, the code will use a default prefix value of "PZ" 7088 :type pz_prefix: str 7089 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7090 additional parameters specific to the prioritization process. These parameters can include 7091 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7092 configurations needed for the prioritization of variants in a V 7093 :type pz_param: dict 7094 :return: A boolean value (True) is being returned from the `prioritization` function. 
7095 """ 7096 7097 # Config 7098 config = self.get_config() 7099 7100 # Param 7101 param = self.get_param() 7102 7103 # Prioritization param 7104 if pz_param is not None: 7105 prioritization_param = pz_param 7106 else: 7107 prioritization_param = param.get("prioritization", {}) 7108 7109 # Configuration profiles 7110 prioritization_config_file = prioritization_param.get( 7111 "prioritization_config", None 7112 ) 7113 prioritization_config_file = full_path(prioritization_config_file) 7114 prioritizations_config = self.get_config_json( 7115 name="prioritizations", config_file=prioritization_config_file 7116 ) 7117 7118 # Prioritization prefix 7119 pz_prefix_default = "PZ" 7120 if pz_prefix is None: 7121 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7122 7123 # Prioritization options 7124 profiles = prioritization_param.get("profiles", []) 7125 if isinstance(profiles, str): 7126 profiles = profiles.split(",") 7127 pzfields = prioritization_param.get( 7128 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7129 ) 7130 if isinstance(pzfields, str): 7131 pzfields = pzfields.split(",") 7132 default_profile = prioritization_param.get("default_profile", None) 7133 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7134 prioritization_score_mode = prioritization_param.get( 7135 "prioritization_score_mode", "HOWARD" 7136 ) 7137 7138 # Quick Prioritizations 7139 prioritizations = param.get("prioritizations", None) 7140 if prioritizations: 7141 log.info("Quick Prioritization:") 7142 for profile in prioritizations.split(","): 7143 if profile not in profiles: 7144 profiles.append(profile) 7145 log.info(f" {profile}") 7146 7147 # If profile "ALL" provided, all profiles in the config profiles 7148 if "ALL" in profiles: 7149 profiles = list(prioritizations_config.keys()) 7150 7151 for profile in profiles: 7152 if prioritizations_config.get(profile, None): 7153 log.debug(f"Profile '{profile}' configured") 7154 else: 7155 msg_error = f"Profile 
'{profile}' NOT configured" 7156 log.error(msg_error) 7157 raise ValueError(msg_error) 7158 7159 if profiles: 7160 log.info(f"Prioritization... ") 7161 else: 7162 log.debug(f"No profile defined") 7163 return False 7164 7165 if not default_profile and len(profiles): 7166 default_profile = profiles[0] 7167 7168 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7169 log.debug("Profiles to check: " + str(list(profiles))) 7170 7171 # Variables 7172 if table is not None: 7173 table_variants = table 7174 else: 7175 table_variants = self.get_table_variants(clause="update") 7176 log.debug(f"Table to prioritize: {table_variants}") 7177 7178 # Added columns 7179 added_columns = [] 7180 7181 # Create list of PZfields 7182 # List of PZFields 7183 list_of_pzfields_original = pzfields + [ 7184 pzfield + pzfields_sep + profile 7185 for pzfield in pzfields 7186 for profile in profiles 7187 ] 7188 list_of_pzfields = [] 7189 log.debug(f"{list_of_pzfields_original}") 7190 7191 # Remove existing PZfields to use if exists 7192 for pzfield in list_of_pzfields_original: 7193 if self.get_header().infos.get(pzfield, None) is None: 7194 list_of_pzfields.append(pzfield) 7195 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7196 else: 7197 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7198 7199 if list_of_pzfields: 7200 7201 # Explode Infos prefix 7202 explode_infos_prefix = self.get_explode_infos_prefix() 7203 7204 # PZfields tags description 7205 PZfields_INFOS = { 7206 f"{pz_prefix}Tags": { 7207 "ID": f"{pz_prefix}Tags", 7208 "Number": ".", 7209 "Type": "String", 7210 "Description": "Variant tags based on annotation criteria", 7211 }, 7212 f"{pz_prefix}Score": { 7213 "ID": f"{pz_prefix}Score", 7214 "Number": 1, 7215 "Type": "Integer", 7216 "Description": "Variant score based on annotation criteria", 7217 }, 7218 f"{pz_prefix}Flag": { 7219 "ID": f"{pz_prefix}Flag", 7220 "Number": 1, 7221 "Type": "String", 7222 
"Description": "Variant flag based on annotation criteria", 7223 }, 7224 f"{pz_prefix}Comment": { 7225 "ID": f"{pz_prefix}Comment", 7226 "Number": ".", 7227 "Type": "String", 7228 "Description": "Variant comment based on annotation criteria", 7229 }, 7230 f"{pz_prefix}Infos": { 7231 "ID": f"{pz_prefix}Infos", 7232 "Number": ".", 7233 "Type": "String", 7234 "Description": "Variant infos based on annotation criteria", 7235 }, 7236 f"{pz_prefix}Class": { 7237 "ID": f"{pz_prefix}Class", 7238 "Number": ".", 7239 "Type": "String", 7240 "Description": "Variant class based on annotation criteria", 7241 }, 7242 } 7243 7244 # Create INFO fields if not exist 7245 for field in PZfields_INFOS: 7246 field_ID = PZfields_INFOS[field]["ID"] 7247 field_description = PZfields_INFOS[field]["Description"] 7248 if field_ID not in self.get_header().infos and field_ID in pzfields: 7249 field_description = ( 7250 PZfields_INFOS[field]["Description"] 7251 + f", profile {default_profile}" 7252 ) 7253 self.get_header().infos[field_ID] = vcf.parser._Info( 7254 field_ID, 7255 PZfields_INFOS[field]["Number"], 7256 PZfields_INFOS[field]["Type"], 7257 field_description, 7258 "unknown", 7259 "unknown", 7260 code_type_map[PZfields_INFOS[field]["Type"]], 7261 ) 7262 7263 # Create INFO fields if not exist for each profile 7264 for profile in prioritizations_config: 7265 if profile in profiles or profiles == []: 7266 for field in PZfields_INFOS: 7267 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7268 field_description = ( 7269 PZfields_INFOS[field]["Description"] 7270 + f", profile {profile}" 7271 ) 7272 if ( 7273 field_ID not in self.get_header().infos 7274 and field in pzfields 7275 ): 7276 self.get_header().infos[field_ID] = vcf.parser._Info( 7277 field_ID, 7278 PZfields_INFOS[field]["Number"], 7279 PZfields_INFOS[field]["Type"], 7280 field_description, 7281 "unknown", 7282 "unknown", 7283 code_type_map[PZfields_INFOS[field]["Type"]], 7284 ) 7285 7286 # Header 7287 for pzfield in 
list_of_pzfields: 7288 if re.match(f"{pz_prefix}Score.*", pzfield): 7289 added_column = self.add_column( 7290 table_name=table_variants, 7291 column_name=pzfield, 7292 column_type="INTEGER", 7293 default_value="0", 7294 ) 7295 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7296 added_column = self.add_column( 7297 table_name=table_variants, 7298 column_name=pzfield, 7299 column_type="BOOLEAN", 7300 default_value="1", 7301 ) 7302 elif re.match(f"{pz_prefix}Class.*", pzfield): 7303 added_column = self.add_column( 7304 table_name=table_variants, 7305 column_name=pzfield, 7306 column_type="VARCHAR[]", 7307 default_value="null", 7308 ) 7309 else: 7310 added_column = self.add_column( 7311 table_name=table_variants, 7312 column_name=pzfield, 7313 column_type="STRING", 7314 default_value="''", 7315 ) 7316 added_columns.append(added_column) 7317 7318 # Profiles 7319 if profiles: 7320 7321 # foreach profile in configuration file 7322 for profile in prioritizations_config: 7323 7324 # If profile is asked in param, or ALL are asked (empty profile []) 7325 if profile in profiles or profiles == []: 7326 log.info(f"Profile '{profile}'") 7327 7328 sql_set_info_option = "" 7329 7330 sql_set_info = [] 7331 7332 # PZ fields set 7333 7334 # PZScore 7335 if ( 7336 f"{pz_prefix}Score{pzfields_sep}{profile}" 7337 in list_of_pzfields 7338 ): 7339 sql_set_info.append( 7340 f""" 7341 concat( 7342 '{pz_prefix}Score{pzfields_sep}{profile}=', 7343 {pz_prefix}Score{pzfields_sep}{profile} 7344 ) 7345 """ 7346 ) 7347 if ( 7348 profile == default_profile 7349 and f"{pz_prefix}Score" in list_of_pzfields 7350 ): 7351 sql_set_info.append( 7352 f""" 7353 concat( 7354 '{pz_prefix}Score=', 7355 {pz_prefix}Score{pzfields_sep}{profile} 7356 ) 7357 """ 7358 ) 7359 7360 # PZFlag 7361 if ( 7362 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7363 in list_of_pzfields 7364 ): 7365 sql_set_info.append( 7366 f""" 7367 concat( 7368 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7369 CASE 7370 WHEN 
{pz_prefix}Flag{pzfields_sep}{profile}==1 7371 THEN 'PASS' 7372 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7373 THEN 'FILTERED' 7374 END 7375 ) 7376 """ 7377 ) 7378 if ( 7379 profile == default_profile 7380 and f"{pz_prefix}Flag" in list_of_pzfields 7381 ): 7382 sql_set_info.append( 7383 f""" 7384 concat( 7385 '{pz_prefix}Flag=', 7386 CASE 7387 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7388 THEN 'PASS' 7389 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7390 THEN 'FILTERED' 7391 END 7392 ) 7393 """ 7394 ) 7395 7396 # PZClass 7397 if ( 7398 f"{pz_prefix}Class{pzfields_sep}{profile}" 7399 in list_of_pzfields 7400 ): 7401 sql_set_info.append( 7402 f""" 7403 concat( 7404 '{pz_prefix}Class{pzfields_sep}{profile}=', 7405 CASE 7406 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7407 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7408 ELSE '.' 7409 END 7410 ) 7411 7412 """ 7413 ) 7414 if ( 7415 profile == default_profile 7416 and f"{pz_prefix}Class" in list_of_pzfields 7417 ): 7418 sql_set_info.append( 7419 f""" 7420 concat( 7421 '{pz_prefix}Class=', 7422 CASE 7423 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7424 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7425 ELSE '.' 
7426 END 7427 ) 7428 """ 7429 ) 7430 7431 # PZComment 7432 if ( 7433 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7434 in list_of_pzfields 7435 ): 7436 sql_set_info.append( 7437 f""" 7438 CASE 7439 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7440 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7441 ELSE '' 7442 END 7443 """ 7444 ) 7445 if ( 7446 profile == default_profile 7447 and f"{pz_prefix}Comment" in list_of_pzfields 7448 ): 7449 sql_set_info.append( 7450 f""" 7451 CASE 7452 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7453 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7454 ELSE '' 7455 END 7456 """ 7457 ) 7458 7459 # PZInfos 7460 if ( 7461 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7462 in list_of_pzfields 7463 ): 7464 sql_set_info.append( 7465 f""" 7466 CASE 7467 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7468 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7469 ELSE '' 7470 END 7471 """ 7472 ) 7473 if ( 7474 profile == default_profile 7475 and f"{pz_prefix}Infos" in list_of_pzfields 7476 ): 7477 sql_set_info.append( 7478 f""" 7479 CASE 7480 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7481 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7482 ELSE '' 7483 END 7484 """ 7485 ) 7486 7487 # Merge PZfields 7488 sql_set_info_option = "" 7489 sql_set_sep = "" 7490 for sql_set in sql_set_info: 7491 if sql_set_sep: 7492 sql_set_info_option += f""" 7493 , concat('{sql_set_sep}', {sql_set}) 7494 """ 7495 else: 7496 sql_set_info_option += f""" 7497 , {sql_set} 7498 """ 7499 sql_set_sep = ";" 7500 7501 sql_queries = [] 7502 for annotation in prioritizations_config[profile]: 7503 7504 # skip special sections 7505 if annotation.startswith("_"): 7506 continue 7507 7508 # For each criterions 7509 for criterion in prioritizations_config[profile][ 7510 annotation 
7511 ]: 7512 7513 # Criterion mode 7514 criterion_mode = None 7515 if np.any( 7516 np.isin(list(criterion.keys()), ["type", "value"]) 7517 ): 7518 criterion_mode = "operation" 7519 elif np.any( 7520 np.isin(list(criterion.keys()), ["sql", "fields"]) 7521 ): 7522 criterion_mode = "sql" 7523 log.debug(f"Criterion Mode: {criterion_mode}") 7524 7525 # Criterion parameters 7526 criterion_type = criterion.get("type", None) 7527 criterion_value = criterion.get("value", None) 7528 criterion_sql = criterion.get("sql", None) 7529 criterion_fields = criterion.get("fields", None) 7530 criterion_score = criterion.get("score", 0) 7531 criterion_flag = criterion.get("flag", "PASS") 7532 criterion_class = criterion.get("class", None) 7533 criterion_flag_bool = criterion_flag == "PASS" 7534 criterion_comment = ( 7535 ", ".join(criterion.get("comment", [])) 7536 .replace("'", "''") 7537 .replace(";", ",") 7538 .replace("\t", " ") 7539 ) 7540 criterion_infos = ( 7541 str(criterion) 7542 .replace("'", "''") 7543 .replace(";", ",") 7544 .replace("\t", " ") 7545 ) 7546 7547 # SQL 7548 if criterion_sql is not None and isinstance( 7549 criterion_sql, list 7550 ): 7551 criterion_sql = " ".join(criterion_sql) 7552 7553 # Fields and explode 7554 if criterion_fields is None: 7555 criterion_fields = [annotation] 7556 if not isinstance(criterion_fields, list): 7557 criterion_fields = str(criterion_fields).split(",") 7558 7559 # Class 7560 if criterion_class is not None and not isinstance( 7561 criterion_class, list 7562 ): 7563 criterion_class = str(criterion_class).split(",") 7564 7565 for annotation_field in criterion_fields: 7566 7567 # Explode specific annotation 7568 log.debug( 7569 f"Explode annotation '{annotation_field}'" 7570 ) 7571 added_columns += self.explode_infos( 7572 prefix=explode_infos_prefix, 7573 fields=[annotation_field], 7574 table=table_variants, 7575 ) 7576 extra_infos = self.get_extra_infos( 7577 table=table_variants 7578 ) 7579 7580 # Check if annotation field is 
present 7581 if ( 7582 f"{explode_infos_prefix}{annotation_field}" 7583 not in extra_infos 7584 ): 7585 msq_err = f"Annotation '{annotation_field}' not in data" 7586 log.error(msq_err) 7587 raise ValueError(msq_err) 7588 else: 7589 log.debug( 7590 f"Annotation '{annotation_field}' in data" 7591 ) 7592 7593 sql_set = [] 7594 sql_set_info = [] 7595 7596 # PZ fields set 7597 7598 # PZScore 7599 if ( 7600 f"{pz_prefix}Score{pzfields_sep}{profile}" 7601 in list_of_pzfields 7602 ): 7603 # VaRank prioritization score mode 7604 if prioritization_score_mode.upper().strip() in ["VARANK", "MAX", "MAXIMUM", "TOP"]: 7605 sql_set.append( 7606 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} ELSE {pz_prefix}Score{pzfields_sep}{profile} END " 7607 ) 7608 # default HOWARD prioritization score mode 7609 else: 7610 sql_set.append( 7611 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7612 ) 7613 7614 # PZFlag 7615 if ( 7616 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7617 in list_of_pzfields 7618 ): 7619 sql_set.append( 7620 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7621 ) 7622 7623 # PZClass 7624 if ( 7625 f"{pz_prefix}Class{pzfields_sep}{profile}" 7626 in list_of_pzfields 7627 and criterion_class is not None 7628 ): 7629 sql_set.append( 7630 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7631 ) 7632 7633 # PZComment 7634 if ( 7635 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7636 in list_of_pzfields 7637 ): 7638 sql_set.append( 7639 f""" 7640 {pz_prefix}Comment{pzfields_sep}{profile} = 7641 concat( 7642 {pz_prefix}Comment{pzfields_sep}{profile}, 7643 CASE 7644 WHEN {pz_prefix}Comment{pzfields_sep}{profile}!='' 7645 THEN ', ' 7646 ELSE '' 7647 END, 7648 '{criterion_comment}' 7649 ) 7650 """ 
7651 ) 7652 7653 # PZInfos 7654 if ( 7655 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7656 in list_of_pzfields 7657 ): 7658 sql_set.append( 7659 f""" 7660 {pz_prefix}Infos{pzfields_sep}{profile} = 7661 concat( 7662 {pz_prefix}Infos{pzfields_sep}{profile}, 7663 '{criterion_infos}' 7664 ) 7665 """ 7666 ) 7667 sql_set_option = ",".join(sql_set) 7668 7669 # Criterion and comparison 7670 if sql_set_option: 7671 7672 if criterion_mode in ["operation"]: 7673 7674 try: 7675 float(criterion_value) 7676 sql_update = f""" 7677 UPDATE {table_variants} 7678 SET {sql_set_option} 7679 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7680 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7681 """ 7682 except: 7683 contains_option = "" 7684 if criterion_type == "contains": 7685 contains_option = ".*" 7686 sql_update = f""" 7687 UPDATE {table_variants} 7688 SET {sql_set_option} 7689 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7690 """ 7691 sql_queries.append(sql_update) 7692 7693 elif criterion_mode in ["sql"]: 7694 7695 sql_update = f""" 7696 UPDATE {table_variants} 7697 SET {sql_set_option} 7698 WHERE {criterion_sql} 7699 """ 7700 sql_queries.append(sql_update) 7701 7702 else: 7703 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7704 log.error(msg_err) 7705 raise ValueError(msg_err) 7706 7707 else: 7708 log.warning( 7709 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7710 ) 7711 7712 # PZTags 7713 if ( 7714 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7715 in list_of_pzfields 7716 ): 7717 7718 # Create PZFalgs value 7719 pztags_value = "" 7720 pztags_sep_default = "," 7721 pztags_sep = "" 7722 for pzfield in pzfields: 7723 if pzfield not in [f"{pz_prefix}Tags"]: 7724 if ( 7725 f"{pzfield}{pzfields_sep}{profile}" 7726 in list_of_pzfields 7727 ): 7728 if pzfield in [f"{pz_prefix}Flag"]: 7729 
pztags_value += f"""{pztags_sep}{pzfield}#', 7730 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7731 THEN 'PASS' 7732 ELSE 'FILTERED' 7733 END, '""" 7734 elif pzfield in [f"{pz_prefix}Class"]: 7735 pztags_value += f"""{pztags_sep}{pzfield}#', 7736 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7737 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7738 ELSE '.' 7739 END, '""" 7740 else: 7741 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7742 pztags_sep = pztags_sep_default 7743 7744 # Add Query update for PZFlags 7745 sql_update_pztags = f""" 7746 UPDATE {table_variants} 7747 SET INFO = concat( 7748 INFO, 7749 CASE WHEN INFO NOT in ('','.') 7750 THEN ';' 7751 ELSE '' 7752 END, 7753 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7754 ) 7755 """ 7756 sql_queries.append(sql_update_pztags) 7757 7758 # Add Query update for PZFlags for default 7759 if profile == default_profile: 7760 sql_update_pztags_default = f""" 7761 UPDATE {table_variants} 7762 SET INFO = concat( 7763 INFO, 7764 ';', 7765 '{pz_prefix}Tags={pztags_value}' 7766 ) 7767 """ 7768 sql_queries.append(sql_update_pztags_default) 7769 7770 log.info(f"""Profile '{profile}' - Prioritization... """) 7771 7772 if sql_queries: 7773 7774 for sql_query in sql_queries: 7775 log.debug( 7776 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7777 ) 7778 self.conn.execute(sql_query) 7779 7780 log.info(f"""Profile '{profile}' - Update... 
""") 7781 sql_query_update = f""" 7782 UPDATE {table_variants} 7783 SET INFO = 7784 concat( 7785 CASE 7786 WHEN INFO NOT IN ('','.') 7787 THEN concat(INFO, ';') 7788 ELSE '' 7789 END 7790 {sql_set_info_option} 7791 ) 7792 """ 7793 self.conn.execute(sql_query_update) 7794 7795 else: 7796 7797 log.warning(f"No profiles in parameters") 7798 7799 # Remove added columns 7800 for added_column in added_columns: 7801 self.drop_column(column=added_column) 7802 7803 # Explode INFOS fields into table fields 7804 if self.get_explode_infos(): 7805 self.explode_infos( 7806 prefix=self.get_explode_infos_prefix(), 7807 fields=self.get_explode_infos_fields(), 7808 force=True, 7809 ) 7810 7811 return True
The prioritization function in Python processes VCF files, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.
Parameters
- table: The `table` parameter in the `prioritization` function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
- pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
- pz_param: The `pz_param` parameter in the `prioritization` method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns
A boolean value (True) is being returned from the `prioritization` function.
def annotation_hgvs(self, threads: int = None) -> None:
    """
    Annotate variants with HGVS nomenclature.

    Reads SNV/InDel variants from the variants table, finds the RefSeq
    transcripts overlapping each position (and, optionally, their protein
    accessions via refSeqLink), builds the HGVS name for each
    variant/transcript pair with `format_hgvs_name`, then appends the result
    to the INFO column under the key "hgvs" and registers the "hgvs" INFO
    field in the VCF header. Work is parallelized with a Dask dataframe
    partitioned by thread count. Returns early (no-op) when no "hgvs"
    section is enabled in the parameters.

    :param threads: The `threads` parameter is an optional integer that
        specifies the number of threads to use for parallel processing. If no
        value is provided, it will default to the number of threads obtained
        from the `get_threads()` method
    :type threads: int
    """

    # Function applied to each partition of the Dask Dataframe
    def partition_function(partition):
        """
        Apply `annotation_hgvs_partition` to each row of the given pandas
        DataFrame partition.

        :param partition: pandas DataFrame with CHROM/POS/REF/ALT columns
        :return: result of applying `annotation_hgvs_partition` row-wise
            (axis=1), i.e. one HGVS annotation string per row
        """
        return partition.apply(annotation_hgvs_partition, axis=1)

    def annotation_hgvs_partition(row) -> str:
        """
        Build the comma-separated list of HGVS names for one variant row.

        Relies on variables captured from the enclosing scope: `polars_conn`,
        `transcripts`, `genome`, and the use_*/add_*/codon_type options.

        :param row: dict-like object with keys "CHROM", "POS", "REF", "ALT"
        :return: comma-joined HGVS names (empty string if no transcript found)
        """

        # NOTE(review): `chr` shadows the builtin of the same name (local only)
        chr = row["CHROM"]
        pos = row["POS"]
        ref = row["REF"]
        alt = row["ALT"]

        # Find list of associated transcripts; `refseq_df` is resolved by the
        # polars SQLContext from the enclosing scope (register_globals=True)
        transcripts_list = list(
            polars_conn.execute(
                f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
                """
            )["transcript"]
        )

        # Full HGVS annotation in list
        hgvs_full_list = []

        for transcript_name in transcripts_list:

            # Transcript model (loaded earlier with read_transcripts)
            transcript = get_transcript(
                transcripts=transcripts, transcript_name=transcript_name
            )
            # Exon number, only when requested
            if use_exon:
                exon = transcript.find_exon_number(pos)
            else:
                exon = None
            # Protein accession via refSeqLink, only when needed
            # NOTE(review): this query assumes `refseqlink_df` exists, but it
            # is only created when a refSeqLink file was found — confirm that
            # use_protein/add_protein/full_format imply refseqlink_file
            transcript_protein = None
            if use_protein or add_protein or full_format:
                transcripts_protein = list(
                    polars_conn.execute(
                        f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                        """
                    )["protein"]
                )
                if len(transcripts_protein):
                    transcript_protein = transcripts_protein[0]

            # HGVS name for this transcript
            hgvs_name = format_hgvs_name(
                chr,
                pos,
                ref,
                alt,
                genome=genome,
                transcript=transcript,
                transcript_protein=transcript_protein,
                exon=exon,
                use_gene=use_gene,
                use_protein=use_protein,
                full_format=full_format,
                use_version=use_version,
                codon_type=codon_type,
            )
            hgvs_full_list.append(hgvs_name)
            # Optionally add the protein-level name as an extra entry
            if add_protein and not use_protein and not full_format:
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=True,
                    full_format=False,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)

        # Create list of HGVS annotations
        hgvs_full = ",".join(hgvs_full_list)

        return hgvs_full

    # Polars connexion (resolves globally-registered dataframes in SQL)
    polars_conn = pl.SQLContext(register_globals=True, eager=True)

    # Config
    config = self.get_config()

    # Databases
    # Genome folder (with default) and genome entry (same key, empty default)
    databases_genomes_folders = (
        config.get("folders", {})
        .get("databases", {})
        .get("genomes", DEFAULT_GENOME_FOLDER)
    )
    databases_genome = (
        config.get("folders", {}).get("databases", {}).get("genomes", "")
    )
    # refseq database folder
    databases_refseq_folders = (
        config.get("folders", {})
        .get("databases", {})
        .get("refseq", DEFAULT_REFSEQ_FOLDER)
    )
    # refseq
    databases_refseq = config.get("databases", {}).get("refSeq", None)
    # refSeqLink
    databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

    # Param
    param = self.get_param()

    # Quick HGVS: parse the "hgvs_options" shortcut string
    # ("opt=val,opt2,..." — bare options default to True, TRUE/FALSE coerced)
    if "hgvs_options" in param and param.get("hgvs_options", ""):
        log.info(f"Quick HGVS Annotation:")
        if not param.get("hgvs", None):
            param["hgvs"] = {}
        for option in param.get("hgvs_options", "").split(","):
            option_var_val = option.split("=")
            option_var = option_var_val[0]
            if len(option_var_val) > 1:
                option_val = option_var_val[1]
            else:
                option_val = "True"
            if option_val.upper() in ["TRUE"]:
                option_val = True
            elif option_val.upper() in ["FALSE"]:
                option_val = False
            log.info(f"   {option_var}={option_val}")
            param["hgvs"][option_var] = option_val

    # Check if HGVS annotation enabled; otherwise nothing to do
    if "hgvs" in param:
        log.info(f"HGVS Annotation... ")
        for hgvs_option in param.get("hgvs", {}):
            log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
    else:
        return

    # HGVS Param
    param_hgvs = param.get("hgvs", {})
    use_exon = param_hgvs.get("use_exon", False)
    use_gene = param_hgvs.get("use_gene", False)
    use_protein = param_hgvs.get("use_protein", False)
    add_protein = param_hgvs.get("add_protein", False)
    full_format = param_hgvs.get("full_format", False)
    use_version = param_hgvs.get("use_version", False)
    codon_type = param_hgvs.get("codon_type", "3")

    # refSeq / refSeqLink files may be overridden in the hgvs param section
    databases_refseq = param_hgvs.get("refseq", databases_refseq)
    databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

    # Assembly
    assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

    # Genome: explicit genome entry first, then search by folder/assembly
    genome_file = None
    if find_genome(databases_genome):
        genome_file = find_genome(databases_genome)
    else:
        genome_file = find_genome(
            genome_path=databases_genomes_folders, assembly=assembly
        )
    log.debug("Genome: " + str(genome_file))

    # refSeq
    refseq_file = find_file_prefix(
        input_file=databases_refseq,
        prefix="ncbiRefSeq",
        folder=databases_refseq_folders,
        assembly=assembly,
    )
    log.debug("refSeq: " + str(refseq_file))

    # refSeqLink
    refseqlink_file = find_file_prefix(
        input_file=databases_refseqlink,
        prefix="ncbiRefSeqLink",
        folder=databases_refseq_folders,
        assembly=assembly,
    )
    log.debug("refSeqLink: " + str(refseqlink_file))

    # Threads
    if not threads:
        threads = self.get_threads()
    log.debug("Threads: " + str(threads))

    # Variables
    table_variants = self.get_table_variants(clause="update")

    # Get variants SNV and InDel only (REF/ALT strictly alphabetic)
    query_variants = f"""
    SELECT "#CHROM" AS CHROM, POS, REF, ALT
    FROM {table_variants}
    WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
    """
    df_variants = self.get_query_to_df(query_variants)

    # Added columns (dropped again at the end of the method)
    added_columns = []

    # Add hgvs working column in variants table (random suffix avoids clashes)
    hgvs_column_name = "hgvs_" + str(random.randrange(1000))
    added_column = self.add_column(
        table_variants, hgvs_column_name, "STRING", default_value=None
    )
    added_columns.append(added_column)

    log.debug(f"refSeq loading...")
    # refSeq in duckDB
    refseq_table = get_refseq_table(
        conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
    )
    # Loading refSeq transcripts overlapping a variant position in Dataframe
    refseq_query = f"""
    SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
    FROM {refseq_table}
    JOIN df_variants ON (
        {refseq_table}.chrom = df_variants.CHROM
        AND {refseq_table}.txStart<=df_variants.POS
        AND {refseq_table}.txEnd>=df_variants.POS
    )
    """
    refseq_df = self.conn.query(refseq_query).pl()

    if refseqlink_file:
        log.debug(f"refSeqLink loading...")
        # refSeqLink in duckDB
        refseqlink_table = get_refseq_table(
            conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
        )
        # Loading transcript -> protein accession mapping in Dataframe
        protacc_column = "protAcc_with_ver"
        mrnaacc_column = "mrnaAcc_with_ver"
        refseqlink_query = f"""
        SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
        FROM {refseqlink_table}
        JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
        WHERE protAcc_without_ver IS NOT NULL
        """
        # Polars Dataframe (queried as `refseqlink_df` in the partition function)
        refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

    # Read RefSeq transcripts into a python dict/model.
    log.debug(f"Transcripts loading...")
    with tempfile.TemporaryDirectory() as tmpdir:
        transcripts_query = f"""
        COPY (
            SELECT {refseq_table}.*
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom=df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        )
        TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
        """
        self.conn.query(transcripts_query)
        with open(f"{tmpdir}/transcript.tsv") as infile:
            transcripts = read_transcripts(infile)

    # Polars connexion
    # NOTE(review): re-created here although already created above; the second
    # instance shadows the first and sees the now-registered dataframes
    polars_conn = pl.SQLContext(register_globals=True, eager=True)

    log.debug("Genome loading...")
    # Read genome sequence using pyfaidx.
    genome = Fasta(genome_file)

    log.debug("Start annotation HGVS...")

    # Create a Dask Dataframe from Pandas dataframe with partition as number of threads
    ddf = dd.from_pandas(df_variants, npartitions=threads)

    # Apply the HGVS annotation function on each partition
    ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

    # Convert Dask DataFrame to Pandas Dataframe
    df = ddf.compute()

    # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
    with tempfile.TemporaryDirectory() as tmpdir:
        df_parquet = os.path.join(tmpdir, "df.parquet")
        df.to_parquet(df_parquet)

        # Update hgvs working column from the parquet file
        update_variant_query = f"""
        UPDATE {table_variants}
        SET "{hgvs_column_name}"=df."{hgvs_column_name}"
        FROM read_parquet('{df_parquet}') as df
        WHERE variants."#CHROM" = df.CHROM
        AND variants.POS = df.POS
        AND variants.REF = df.REF
        AND variants.ALT = df.ALT
        AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
        """
        self.execute_query(update_variant_query)

    # Update INFO column: append "hgvs=<value>" with ';' separator if needed
    sql_query_update = f"""
    UPDATE {table_variants}
    SET INFO =
        concat(
            CASE
                WHEN INFO NOT IN ('','.')
                THEN concat(INFO, ';')
                ELSE ''
            END,
            'hgvs=',
            {hgvs_column_name}
        )
    WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
    """
    self.execute_query(sql_query_update)

    # Add "hgvs" INFO field to the VCF header
    HGVS_INFOS = {
        "hgvs": {
            "ID": "hgvs",
            "Number": ".",
            "Type": "String",
            "Description": f"HGVS annotatation with HOWARD",
        }
    }

    for field in HGVS_INFOS:
        field_ID = HGVS_INFOS[field]["ID"]
        field_description = HGVS_INFOS[field]["Description"]
        self.get_header().infos[field_ID] = vcf.parser._Info(
            field_ID,
            HGVS_INFOS[field]["Number"],
            HGVS_INFOS[field]["Type"],
            field_description,
            "unknown",
            "unknown",
            code_type_map[HGVS_INFOS[field]["Type"]],
        )

    # Remove added columns
    for added_column in added_columns:
        self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the `get_threads()` method
def get_operations_help(
    self, operations_config_dict: dict = None, operations_config_file: str = None
) -> list:
    """
    Build a sorted, human-readable list of available calculation operations.

    Loads the "calculations" operations configuration (from a dict and/or a
    file via `get_config_json`), keeps only operations flagged as available,
    and formats one line per operation.

    :param operations_config_dict: optional operations configuration dict;
        defaults to an empty dict (None sentinel avoids the shared
        mutable-default pitfall)
    :type operations_config_dict: dict (optional)
    :param operations_config_file: optional path to an operations
        configuration file
    :type operations_config_file: str (optional)
    :return: list of lines headed by "Available calculation operations:",
        followed by one alphabetically sorted line per available operation
    """

    # Normalize the mutable default (never share a dict across calls)
    if operations_config_dict is None:
        operations_config_dict = {}

    # Init
    operations_help = []

    # Operations configuration (merged from dict and/or file)
    operations = self.get_config_json(
        name="calculations",
        config_dict=operations_config_dict,
        config_file=operations_config_file,
    )
    for op in operations:
        # Fall back to the config key as name, and to the name as description
        op_name = operations[op].get("name", op).upper()
        op_description = operations[op].get("description", op_name)
        op_available = operations[op].get("available", False)
        if op_available:
            operations_help.append(f"   {op_name}: {op_description}")

    # Sort operations alphabetically
    operations_help.sort()

    # Insert header line
    operations_help.insert(0, "Available calculation operations:")

    # Return
    return operations_help
def calculation(
    self,
    operations: dict = {},
    operations_config_dict: dict = {},
    operations_config_file: str = None,
) -> None:
    """
    Run the configured calculation operations on the variants data.

    For each requested operation, looks up its definition in the operations
    configuration and dispatches to `calculation_process_function` (type
    "python") or `calculation_process_sql` (type "sql"). After all
    operations, re-explodes INFO fields into table columns if enabled.

    :param operations: dict of operations to process; overridden by the
        "calculation" > "calculations" section of the parameters when present
    :param operations_config_dict: operations configuration as a dict
    :param operations_config_file: path to an operations configuration file;
        falls back to param "calculation" > "calculation_config" when None
    :raises ValueError: if an operation name or its type is not available in
        the operations configuration

    param json example:
        "calculation": {
            "NOMEN": {
                "options": {
                    "hgvs_field": "hgvs"
                },
                "middle": null
            }
        }
    """
    # NOTE(review): `operations` and `operations_config_dict` use mutable
    # default arguments ({}); safe only while they are never mutated here

    # Param
    param = self.get_param()

    # Operations config file: fall back to the one declared in param
    if operations_config_file is None:
        operations_config_file = param.get("calculation", {}).get(
            "calculation_config", None
        )

    # Operations config (merged from dict and/or file)
    operations_config = self.get_config_json(
        name="calculations",
        config_dict=operations_config_dict,
        config_file=operations_config_file,
    )

    # Upper keys (operation lookup is case-insensitive)
    operations_config = {k.upper(): v for k, v in operations_config.items()}

    # Calculations

    # Operations from param take precedence over the argument
    operations = param.get("calculation", {}).get("calculations", operations)

    # Quick calculation - add operations from the "calculations" CSV shortcut
    if param.get("calculations", None):

        # List of operations
        calculations_list = [
            value.strip() for value in param.get("calculations", "").split(",")
        ]

        # Log
        log.info(f"Quick Calculations:")
        for calculation_key in calculations_list:
            log.info(f"   {calculation_key}")

        # Create tmp operations (to keep operation order)
        operations_tmp = {}
        for calculation_operation in calculations_list:
            if calculation_operation.upper() not in operations_tmp:
                # NOTE(review): ".upper()" is outside the braces, so the log
                # message shows the literal text ".upper()" instead of the
                # uppercased name
                log.debug(
                    f"{calculation_operation}.upper() not in {operations_tmp}"
                )
                operations_tmp[calculation_operation.upper()] = {}
                # Reuse options already configured in param when available
                add_value_into_dict(
                    dict_tree=operations_tmp,
                    sections=[
                        calculation_operation.upper(),
                    ],
                    value=operations.get(calculation_operation.upper(), {}),
                )
        # Add operations already in param (preserving their options)
        for calculation_operation in operations:
            if calculation_operation not in operations_tmp:
                operations_tmp[calculation_operation] = operations.get(
                    calculation_operation, {}
                )

        # Update operations in param
        operations = operations_tmp

    # Operations for calculation (last-resort fallback)
    if not operations:
        operations = param.get("calculation", {}).get("calculations", {})

    if operations:
        log.info(f"Calculations...")

        # For each operations
        for operation_name in operations:
            operation_name = operation_name.upper()
            if operation_name not in [""]:
                if operation_name in operations_config:
                    log.info(f"Calculation '{operation_name}'")
                    operation = operations_config[operation_name]
                    # Operation type defaults to "sql"
                    operation_type = operation.get("type", "sql")
                    if operation_type == "python":
                        self.calculation_process_function(
                            operation=operation, operation_name=operation_name
                        )
                    elif operation_type == "sql":
                        self.calculation_process_sql(
                            operation=operation, operation_name=operation_name
                        )
                    else:
                        log.error(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Type '{operation_type}' NOT available"
                        )
                else:
                    log.error(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )
                    raise ValueError(
                        f"Operations config: Calculation '{operation_name}' NOT available"
                    )

    # Explode INFOS fields into table fields
    if self.get_explode_infos():
        self.explode_infos(
            prefix=self.get_explode_infos_prefix(),
            fields=self.get_explode_infos_fields(),
            force=True,
        )
It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }
def calculation_process_sql(
    self, operation: dict, operation_name: str = "unknown"
) -> None:
    """
    The `calculation_process_sql` function takes in a mathematical operation as a string and
    performs the operation, updating the specified table with the result.

    The result of `operation_query` is written into a (possibly prefixed)
    output column, optionally appended to the INFO column, and the output
    field is registered in the VCF header. Temporary columns are dropped on
    success; on query failure they are intentionally left in place and a
    ValueError is raised.

    :param operation: The `operation` parameter is a dictionary that contains information about the
    mathematical operation to be performed (keys: "name", "operation_query",
    "output_column_name", "output_column_type", "info_fields",
    "info_fields_check", "operation_info", "explode_infos_prefix", "table")
    :type operation: dict
    :param operation_name: The `operation_name` parameter is a string that represents the name of
    the mathematical operation being performed. It is used for logging and error handling purposes,
    defaults to unknown
    :type operation_name: str (optional)
    :raises ValueError: if the query is missing, a mandatory info field is
        absent from the header, or the update query fails
    """

    # Operation infos
    # NOTE(review): the parameter is immediately overridden by the "name" key
    operation_name = operation.get("name", "unknown")
    log.debug(f"process SQL {operation_name}")
    output_column_name = operation.get("output_column_name", operation_name)
    output_column_type = operation.get("output_column_type", "String")
    prefix = operation.get("explode_infos_prefix", "")
    output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
    output_column_description = operation.get(
        "output_column_description", f"{operation_name} operation"
    )
    operation_query = operation.get("operation_query", None)
    if isinstance(operation_query, list):
        operation_query = " ".join(operation_query)
    operation_info_fields = operation.get("info_fields", [])
    operation_info_fields_check = operation.get("info_fields_check", False)
    operation_info = operation.get("operation_info", True)
    operation_table = operation.get(
        "table", self.get_table_variants(clause="alter")
    )

    # Table variants (operation table when provided, else default)
    if operation_table:
        table_variants = operation_table
    else:
        table_variants = self.get_table_variants(clause="alter")

    if operation_query:

        # Info fields check: all declared info fields must be in the header
        operation_info_fields_check_result = True
        if operation_info_fields_check:
            header_infos = self.get_header().infos
            for info_field in operation_info_fields:
                operation_info_fields_check_result = (
                    operation_info_fields_check_result
                    and info_field in header_infos
                )

        # If info fields available
        if operation_info_fields_check_result:

            # Added columns (dropped again on success)
            added_columns = []

            # Create VCF header field for the output column
            vcf_reader = self.get_header()
            vcf_reader.infos[output_column_name] = vcf.parser._Info(
                output_column_name,
                ".",
                output_column_type,
                output_column_description,
                "howard calculation",
                "0",
                # NOTE(review): other methods in this file use the
                # module-level `code_type_map`; confirm `self.code_type_map`
                # is actually defined on this class
                self.code_type_map.get(output_column_type),
            )

            # Explode infos if needed
            log.debug(f"calculation_process_sql prefix {prefix}")
            added_columns += self.explode_infos(
                prefix=prefix,
                fields=[output_column_name] + operation_info_fields,
                force=False,
                table=table_variants,
            )

            # Create output column
            added_column = self.add_column(
                table_name=table_variants,
                column_name=prefix + output_column_name,
                column_type=output_column_type_sql,
                default_value="null",
            )
            added_columns.append(added_column)

            # Operation calculation
            try:

                # Query to update calculation column
                sql_update = f"""
                    UPDATE {table_variants}
                    SET "{prefix}{output_column_name}" = ({operation_query})
                """
                self.conn.execute(sql_update)

                # Add to INFO (appending ';' separator when INFO is non-empty)
                if operation_info:
                    sql_update_info = f"""
                        UPDATE {table_variants}
                        SET "INFO" =
                            concat(
                                CASE
                                    WHEN "INFO" IS NOT NULL
                                    THEN concat("INFO", ';')
                                    ELSE ''
                                END,
                                '{output_column_name}=',
                                "{prefix}{output_column_name}"
                            )
                        WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
                    """
                    self.conn.execute(sql_update_info)

            # Narrowed from a bare `except:` (which also swallowed
            # SystemExit/KeyboardInterrupt); chain the cause for debugging
            # while keeping the ValueError contract for callers
            except Exception as err:
                msg_err = (
                    f"Operations config: Calculation '{operation_name}' query failed"
                )
                log.error(msg_err)
                raise ValueError(msg_err) from err

            # Remove added columns (only reached on success, as before)
            for added_column in added_columns:
                log.debug(f"added_column: {added_column}")
                self.drop_column(column=added_column)

        else:
            msg_err = f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
            log.error(msg_err)
            raise ValueError(msg_err)

    else:
        msg_err = (
            f"Operations config: Calculation '{operation_name}' query NOT defined"
        )
        log.error(msg_err)
        raise ValueError(msg_err)
The `calculation_process_sql` function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.

Parameters
- operation: a dictionary that contains information about the mathematical operation to be
  performed (name, output column name/type/description, SQL query, info fields).
- operation_name: a string that represents the name of the mathematical operation being
  performed. It is used for logging and error handling purposes; defaults to "unknown".
8508 def calculation_process_function( 8509 self, operation: dict, operation_name: str = "unknown" 8510 ) -> None: 8511 """ 8512 The `calculation_process_function` takes in an operation dictionary and performs the specified 8513 function with the given parameters. 8514 8515 :param operation: The `operation` parameter is a dictionary that contains information about the 8516 operation to be performed. It has the following keys: 8517 :type operation: dict 8518 :param operation_name: The `operation_name` parameter is a string that represents the name of 8519 the operation being performed. It is used for logging purposes, defaults to unknown 8520 :type operation_name: str (optional) 8521 """ 8522 8523 operation_name = operation["name"] 8524 log.debug(f"process Python {operation_name}") 8525 function_name = operation["function_name"] 8526 function_params = operation["function_params"] 8527 getattr(self, function_name)(*function_params)
The `calculation_process_function` takes in an operation dictionary and performs the
specified function with the given parameters.

Parameters
- operation: a dictionary that contains information about the operation to be performed
  (name, function name, function parameters).
- operation_name: a string that represents the name of the operation being performed.
  It is used for logging purposes; defaults to "unknown".
8529 def calculation_variant_id(self) -> None: 8530 """ 8531 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8532 updates the INFO field of a variants table with the variant ID. 8533 """ 8534 8535 # variant_id annotation field 8536 variant_id_tag = self.get_variant_id_column() 8537 added_columns = [variant_id_tag] 8538 8539 # variant_id hgvs tags" 8540 vcf_infos_tags = { 8541 variant_id_tag: "howard variant ID annotation", 8542 } 8543 8544 # Variants table 8545 table_variants = self.get_table_variants() 8546 8547 # Header 8548 vcf_reader = self.get_header() 8549 8550 # Add variant_id to header 8551 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8552 variant_id_tag, 8553 ".", 8554 "String", 8555 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8556 "howard calculation", 8557 "0", 8558 self.code_type_map.get("String"), 8559 ) 8560 8561 # Update 8562 sql_update = f""" 8563 UPDATE {table_variants} 8564 SET "INFO" = 8565 concat( 8566 CASE 8567 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8568 THEN '' 8569 ELSE concat("INFO", ';') 8570 END, 8571 '{variant_id_tag}=', 8572 "{variant_id_tag}" 8573 ) 8574 """ 8575 self.conn.execute(sql_update) 8576 8577 # Remove added columns 8578 for added_column in added_columns: 8579 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the SnpEff annotation field of a VCF file
        and append them as a new INFO tag in the variants table.

        Requires the file to have been annotated with snpEff beforehand (the
        `snpeff_field` tag must be present in the header); otherwise only a
        warning is logged.

        :param snpeff_hgvs: name of the INFO tag that will store the HGVS
            nomenclatures extracted from the SnpEff annotation field, defaults to
            snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: field of the VCF file that contains SnpEff
            annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description does not contain a
            parseable quoted annotation-format string
        """

        # Description attached to the new INFO tag
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix of exploded INFO columns
        # NOTE(review): any non-empty prefix is normalized to "INFO/" here —
        # looks intentional (exploded columns use that prefix) but confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the snpEff source field and the new HGVS field
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the snpEff annotation field into a column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract the ANN sub-field names from the quoted format string in
            # the header description (e.g. "... 'A | B | C'")
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                # Map sanitized (alphanumeric-only) names to the raw sub-field names
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            # NOTE: duckdb resolves 'dataframe_snpeff_hgvs' in the UPDATE below by
            # the name of this Python local variable (replacement scan) — do not
            # rename it
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Extract the HGVS nomenclature for each variant from its ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add the new tag to the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<snpeff_hgvs>=<value>' to INFO, joining on the variant ID
            # NOTE(review): the UPDATE targets the hard-coded 'variants' table
            # while the WHERE clause uses {table_variants} — confirm these always
            # refer to the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the (potentially large) dataframe immediately
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Drop the helper columns created for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
- snpeff_hgvs: the name of the column that will store the HGVS nomenclatures extracted from
  the SnpEff annotation field in a VCF file; defaults to "snpeff_hgvs".
- snpeff_field: the field in the VCF file that contains SnpEff annotations, from which the
  HGVS nomenclatures are extracted; defaults to "ANN".
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode the SnpEff annotation field of a VCF file into separate INFO
        annotations (one tag per ANN sub-field, or a single JSON tag) and update
        the variant information accordingly.

        Requires the file to have been annotated with snpEff beforehand (the
        `snpeff_field` tag must be present in the header); otherwise only a
        warning is logged.

        :param uniquify: whether duplicate entries should be removed from the
            exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: format of the generated annotations, "fields" for
            one tag per ANN sub-field or "JSON" for a single JSON tag, defaults to
            fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated annotation tags to
            differentiate them from existing ones, defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: field of the VCF file that contains SnpEff
            annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description does not contain a
            parseable quoted annotation-format string
        """

        # Internal name of the exploded-annotations working field
        snpeff_hgvs = "snpeff_ann_explode"

        # Description attached to the generated INFO tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix of exploded INFO columns
        # NOTE(review): any non-empty prefix is normalized to "INFO/" here —
        # looks intentional (exploded columns use that prefix) but confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the snpEff source field and the working field
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Explode the snpEff annotation field into a column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the ANN sub-field names from the quoted format string in
            # the header description (e.g. "... 'A | B | C'")
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                # Sanitized (alphanumeric-only) sub-field names, in order, plus a
                # map back to the raw sub-field names
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            # NOTE: duckdb resolves 'dataframe_snpeff_hgvs' in the UPDATE below by
            # the name of this Python local variable (replacement scan) — do not
            # rename it
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode each variant's ANN value into the requested output format
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the generated tags in the VCF header: a single JSON tag, or
            # one tag per ANN sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # In JSON mode the exploded value lacks the leading 'tag=' part,
                # so it is prepended in the SQL below
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO, joining on the variant ID
            # NOTE(review): the UPDATE targets the hard-coded 'variants' table
            # while the WHERE clause uses {table_variants} — confirm these always
            # refer to the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the (potentially large) dataframe immediately
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Drop the helper columns created for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.

Parameters
- uniquify: boolean flag that determines whether the output should be uniquified, i.e.
  whether duplicate entries are removed; defaults to True.
- output_format: the format in which the output annotations will be generated, either
  "fields" or "JSON"; defaults to "fields".
- output_prefix: the prefix added to the output annotations generated during the
  calculation, to differentiate the newly added annotations from existing ones; defaults
  to "snpeff_".
- snpeff_field: the field in the VCF file that contains SnpEff annotations; this field is
  processed to explode the annotations and update the variant information; defaults to "ANN".
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS nomenclature components (NOMEN, CNOMEN, PNOMEN, ...)
        from an exploded HGVS field and append them as INFO tags on the variants
        table.

        The HGVS source field, NOMEN pattern, transcripts-of-preference file,
        transcripts table/column, and source order are all read from the
        "calculation" > "calculations" > "NOMEN" > "options" parameters.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the dataframe column holding the per-variant NOMEN structure
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN component tags and their header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Threads
        # NOTE(review): 'threads' is currently assigned but never used below
        threads = self.get_threads()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped at the end
        added_columns = []

        # Name of the INFO field holding the HGVS nomenclatures to parse
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None) if False else
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Transcripts of preference, keyed by source ("file", ...)
        transcripts_sources = {}

        # Transcripts-of-preference file (first column = transcript IDs)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Table holding a per-variant preferred-transcript column (optional)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            added_columns += self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            extra_field_transcript = f"NULL"

        # Order in which transcript sources are consulted
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode the HGVS field into a column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            # NOTE: duckdb resolves 'dataframe_hgvs' in the UPDATE below by the
            # name of this Python local variable (replacement scan) — do not
            # rename it
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Rank transcripts by their order of preference (1 = most preferred)
            transcripts_rank = {transcript: rank for rank, transcript in enumerate(transcripts, start=1)}
            transcripts_len = len(transcripts_rank)

            # Compute the NOMEN structure for each variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts_rank,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                    transcripts_len=transcripts_len
                ),
                axis=1,
            )

            # Declare each NOMEN component in the header and build the SQL
            # fragment that appends it to INFO
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Add field to SQL query update
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT NULL AND dataframe_hgvs."{field_nomen_dict}"."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{field_nomen_dict}"."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # All component fragments become arguments of a single concat()
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append all NOMEN components to INFO, joining on chrom/pos/ref/alt
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the (potentially large) dataframe immediately
            del dataframe_hgvs
            gc.collect()

        # Drop the helper columns created for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Compute, for each variant, the number of pipelines/samples in which it is
        found, and append it to the INFO field as '<tag>=<value>'.

        Does nothing if the file has no FORMAT column or no sample.

        :param tag: annotation field name for the "findbypipeline" information;
            used both in the VCF header and in the INFO field of the variants
            table, defaults to findbypipeline
        :type tag: str (optional)
        """

        # Require genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # Description attached to the INFO tag
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the computed value
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            # NOTE: duckdb resolves 'dataframe_findbypipeline' in the UPDATE below
            # by the name of this Python local variable (replacement scan) — do
            # not rename it
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value from each row's genotypes
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add the findbypipeline tag to the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO, joining on the variant ID
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the helper columns created for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the (potentially large) dataframe immediately
            del dataframe_findbypipeline
            gc.collect()
The function `calculation_find_by_pipeline` performs a calculation to find the number of
pipelines/samples for a variant and updates the variant information in a VCF file.

Parameters
- tag: the annotation field name for the "findbypipeline" information in the VCF file. It
  is used to create the annotation field in the VCF header and to update the corresponding
  field in the variants table; defaults to "findbypipeline".
    def calculation_genotype_concordance(self) -> None:
        """
        Compute, for each variant of a multi-caller VCF, the concordance of the
        genotypes across samples, and append it to the INFO field as
        'genotypeconcordance=<value>'.

        Does nothing if the file has no FORMAT column or no sample.
        """

        # Require genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # Description attached to the INFO tag
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the dataframe column holding the computed value
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant ID column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant_id, FORMAT and all samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe
            # NOTE: duckdb resolves 'dataframe_genotypeconcordance' in the UPDATE
            # below by the name of this Python local variable (replacement scan)
            # — do not rename it
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the concordance value from each row's genotypes
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add the genotypeconcordance tag to the VCF header
            # NOTE(review): the fallback description "snpEff hgvs annotations"
            # looks like a copy/paste leftover; it is never used since the tag is
            # always present in vcf_infos_tags
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO, joining on the variant ID
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the helper columns created for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the (potentially large) dataframe immediately
            del dataframe_genotypeconcordance
            gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
    def calculation_barcode(self, tag: str = "barcode") -> None:
        """
        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
        updates the INFO field in the file with the calculated barcode values.

        The barcode is computed per-variant from the genotypes of all samples (via the
        `barcode` helper) and appended to INFO as '<tag>=<value>'.

        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
        the default tag name is set to "barcode", defaults to barcode
        :type tag: str (optional)
        """

        # Only applicable when the VCF has genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicitly empty tag)
            if not tag:
                tag = "barcode"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode calculation (VaRank)",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field (dataframe column name holding the computed barcode)
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (tracked so it can be dropped afterwards)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe with genotype columns only
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column, one value per variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
            )

            # Add barcode to header
            # NOTE(review): the fallback `vcf_infos_tags.get(tag)` is redundant — it is the
            # same lookup as the primary key; presumably a literal default was intended
            vcf_reader.infos[tag] = vcf.parser._Info(
                tag,
                ".",
                "String",
                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<barcode>' to INFO, joining on the variant id column;
            # empty/'.' INFO is normalized so no leading ';' is produced
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
                            AND dataframe_barcode."{barcode_infos}" NOT NULL
                            THEN concat(
                                '{tag}=',
                                dataframe_barcode."{barcode_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly
            del dataframe_barcode
            gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The
`tag` parameter in the `calculation_barcode` function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name "barcode" is used.
9404 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9405 """ 9406 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9407 and updates the INFO field in the file with the calculated barcode values. 9408 9409 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9410 the barcode tag that will be added to the VCF file during the calculation process. If no value 9411 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9412 :type tag: str (optional) 9413 """ 9414 9415 # if FORMAT and samples 9416 if ( 9417 "FORMAT" in self.get_header_columns_as_list() 9418 and self.get_header_sample_list() 9419 ): 9420 9421 # barcode annotation field 9422 if not tag: 9423 tag = "BCF" 9424 9425 # VCF infos tags 9426 vcf_infos_tags = { 9427 tag: "barcode family calculation", 9428 f"{tag}S": "barcode family samples", 9429 } 9430 9431 # Param 9432 param = self.get_param() 9433 log.debug(f"param={param}") 9434 9435 # Prefix 9436 prefix = self.get_explode_infos_prefix() 9437 9438 # PED param 9439 ped = ( 9440 param.get("calculation", {}) 9441 .get("calculations", {}) 9442 .get("BARCODEFAMILY", {}) 9443 .get("family_pedigree", None) 9444 ) 9445 log.debug(f"ped={ped}") 9446 9447 # Load PED 9448 if ped: 9449 9450 # Pedigree is a file 9451 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9452 log.debug("Pedigree is file") 9453 with open(full_path(ped)) as ped: 9454 ped = yaml.safe_load(ped) 9455 9456 # Pedigree is a string 9457 elif isinstance(ped, str): 9458 log.debug("Pedigree is str") 9459 try: 9460 ped = json.loads(ped) 9461 log.debug("Pedigree is json str") 9462 except ValueError as e: 9463 ped_samples = ped.split(",") 9464 ped = {} 9465 for ped_sample in ped_samples: 9466 ped[ped_sample] = ped_sample 9467 9468 # Pedigree is a dict 9469 elif isinstance(ped, dict): 9470 log.debug("Pedigree is dict") 9471 9472 # Pedigree is not well formatted 9473 
else: 9474 msg_error = "Pedigree not well formatted" 9475 log.error(msg_error) 9476 raise ValueError(msg_error) 9477 9478 # Construct list 9479 ped_samples = list(ped.values()) 9480 9481 else: 9482 log.debug("Pedigree not defined. Take all samples") 9483 ped_samples = self.get_header_sample_list() 9484 ped = {} 9485 for ped_sample in ped_samples: 9486 ped[ped_sample] = ped_sample 9487 9488 # Check pedigree 9489 if not ped or len(ped) == 0: 9490 msg_error = f"Error in pedigree: samples {ped_samples}" 9491 log.error(msg_error) 9492 raise ValueError(msg_error) 9493 9494 # Log 9495 log.info( 9496 "Calculation 'BARCODEFAMILY' - Samples: " 9497 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9498 ) 9499 log.debug(f"ped_samples={ped_samples}") 9500 9501 # Field 9502 barcode_infos = prefix + tag 9503 9504 # Variants table 9505 table_variants = self.get_table_variants() 9506 9507 # Header 9508 vcf_reader = self.get_header() 9509 9510 # Create variant id 9511 variant_id_column = self.get_variant_id_column() 9512 added_columns = [variant_id_column] 9513 9514 # variant_id, FORMAT and samples 9515 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9516 [f""" "{sample}" """ for sample in ped_samples] 9517 ) 9518 9519 # Create dataframe 9520 dataframe_barcode = self.get_query_to_df( 9521 f""" SELECT {samples_fields} FROM {table_variants} """ 9522 ) 9523 9524 # Create barcode column 9525 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9526 lambda row: barcode(row, samples=ped_samples), axis=1 9527 ) 9528 9529 # Add barcode family to header 9530 # Add vaf_normalization to header 9531 vcf_reader.formats[tag] = vcf.parser._Format( 9532 id=tag, 9533 num=".", 9534 type="String", 9535 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9536 type_code=self.code_type_map.get("String"), 9537 ) 9538 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9539 id=f"{tag}S", 9540 num=".", 9541 type="String", 9542 desc=vcf_infos_tags.get(f"{tag}S", 
"barcode family samples"), 9543 type_code=self.code_type_map.get("String"), 9544 ) 9545 9546 # Update 9547 # for sample in ped_samples: 9548 sql_update_set = [] 9549 for sample in self.get_header_sample_list() + ["FORMAT"]: 9550 if sample in ped_samples: 9551 value = f'dataframe_barcode."{barcode_infos}"' 9552 value_samples = "'" + ",".join([f""" "{sample}" """ for sample in ped_samples]) + "'" 9553 ped_samples 9554 elif sample == "FORMAT": 9555 value = f"'{tag}'" 9556 value_samples = f"'{tag}S'" 9557 else: 9558 value = "'.'" 9559 value_samples = "'.'" 9560 format_regex = r"[a-zA-Z0-9\s]" 9561 sql_update_set.append( 9562 f""" 9563 "{sample}" = 9564 concat( 9565 CASE 9566 WHEN {table_variants}."{sample}" = './.' 9567 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9568 ELSE {table_variants}."{sample}" 9569 END, 9570 ':', 9571 {value}, 9572 ':', 9573 {value_samples} 9574 ) 9575 """ 9576 ) 9577 9578 sql_update_set_join = ", ".join(sql_update_set) 9579 sql_update = f""" 9580 UPDATE {table_variants} 9581 SET {sql_update_set_join} 9582 FROM dataframe_barcode 9583 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9584 """ 9585 self.conn.execute(sql_update) 9586 9587 # Remove added columns 9588 for added_column in added_columns: 9589 self.drop_column(column=added_column) 9590 9591 # Delete dataframe 9592 del dataframe_barcode 9593 gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- tag: The
`tag` parameter in the `calculation_barcode_family` function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the `tag` parameter, the default value "BCF" is used.
9595 def calculation_trio(self) -> None: 9596 """ 9597 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9598 information to the INFO field of each variant. 9599 """ 9600 9601 # if FORMAT and samples 9602 if ( 9603 "FORMAT" in self.get_header_columns_as_list() 9604 and self.get_header_sample_list() 9605 ): 9606 9607 # trio annotation field 9608 trio_tag = "trio" 9609 9610 # VCF infos tags 9611 vcf_infos_tags = { 9612 "trio": "trio calculation", 9613 } 9614 9615 # Param 9616 param = self.get_param() 9617 9618 # Prefix 9619 prefix = self.get_explode_infos_prefix() 9620 9621 # Trio param 9622 trio_ped = ( 9623 param.get("calculation", {}) 9624 .get("calculations", {}) 9625 .get("TRIO", {}) 9626 .get("trio_pedigree", None) 9627 ) 9628 9629 # Load trio 9630 if trio_ped: 9631 9632 # Trio pedigree is a file 9633 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9634 log.debug("TRIO pedigree is file") 9635 with open(full_path(trio_ped)) as trio_ped: 9636 trio_ped = yaml.safe_load(trio_ped) 9637 9638 # Trio pedigree is a string 9639 elif isinstance(trio_ped, str): 9640 log.debug("TRIO pedigree is str") 9641 try: 9642 trio_ped = json.loads(trio_ped) 9643 log.debug("TRIO pedigree is json str") 9644 except ValueError as e: 9645 trio_samples = trio_ped.split(",") 9646 if len(trio_samples) == 3: 9647 trio_ped = { 9648 "father": trio_samples[0], 9649 "mother": trio_samples[1], 9650 "child": trio_samples[2], 9651 } 9652 log.debug("TRIO pedigree is list str") 9653 else: 9654 msg_error = "TRIO pedigree not well formatted" 9655 log.error(msg_error) 9656 raise ValueError(msg_error) 9657 9658 # Trio pedigree is a dict 9659 elif isinstance(trio_ped, dict): 9660 log.debug("TRIO pedigree is dict") 9661 9662 # Trio pedigree is not well formatted 9663 else: 9664 msg_error = "TRIO pedigree not well formatted" 9665 log.error(msg_error) 9666 raise ValueError(msg_error) 9667 9668 # Construct trio list 9669 trio_samples = [ 9670 
trio_ped.get("father", ""), 9671 trio_ped.get("mother", ""), 9672 trio_ped.get("child", ""), 9673 ] 9674 9675 else: 9676 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9677 samples_list = self.get_header_sample_list() 9678 if len(samples_list) >= 3: 9679 trio_samples = self.get_header_sample_list()[0:3] 9680 trio_ped = { 9681 "father": trio_samples[0], 9682 "mother": trio_samples[1], 9683 "child": trio_samples[2], 9684 } 9685 else: 9686 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9687 log.error(msg_error) 9688 raise ValueError(msg_error) 9689 9690 # Check trio pedigree 9691 if not trio_ped or len(trio_ped) != 3: 9692 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9693 log.error(msg_error) 9694 raise ValueError(msg_error) 9695 9696 # Log 9697 log.info( 9698 f"Calculation 'TRIO' - Samples: " 9699 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9700 ) 9701 9702 # Field 9703 trio_infos = prefix + trio_tag 9704 9705 # Variants table 9706 table_variants = self.get_table_variants() 9707 9708 # Header 9709 vcf_reader = self.get_header() 9710 9711 # Create variant id 9712 variant_id_column = self.get_variant_id_column() 9713 added_columns = [variant_id_column] 9714 9715 # variant_id, FORMAT and samples 9716 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9717 [f""" "{sample}" """ for sample in self.get_header_sample_list()] 9718 ) 9719 9720 # Create dataframe 9721 dataframe_trio = self.get_query_to_df( 9722 f""" SELECT {samples_fields} FROM {table_variants} """ 9723 ) 9724 9725 # Create trio column 9726 dataframe_trio[trio_infos] = dataframe_trio.apply( 9727 lambda row: trio(row, samples=trio_samples), axis=1 9728 ) 9729 9730 # Add trio to header 9731 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9732 trio_tag, 9733 ".", 9734 "String", 9735 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9736 "howard calculation", 9737 "0", 9738 
self.code_type_map.get("String"), 9739 ) 9740 9741 # Update 9742 sql_update = f""" 9743 UPDATE {table_variants} 9744 SET "INFO" = 9745 concat( 9746 CASE 9747 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9748 THEN '' 9749 ELSE concat("INFO", ';') 9750 END, 9751 CASE 9752 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9753 AND dataframe_trio."{trio_infos}" NOT NULL 9754 THEN concat( 9755 '{trio_tag}=', 9756 dataframe_trio."{trio_infos}" 9757 ) 9758 ELSE '' 9759 END 9760 ) 9761 FROM dataframe_trio 9762 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9763 """ 9764 self.conn.execute(sql_update) 9765 9766 # Remove added columns 9767 for added_column in added_columns: 9768 self.drop_column(column=added_column) 9769 9770 # Delete dataframe 9771 del dataframe_trio 9772 gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9774 def calculation_vaf_normalization(self) -> None: 9775 """ 9776 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9777 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9778 :return: The function does not return anything. 9779 """ 9780 9781 # if FORMAT and samples 9782 if ( 9783 "FORMAT" in self.get_header_columns_as_list() 9784 and self.get_header_sample_list() 9785 ): 9786 9787 # vaf_normalization annotation field 9788 vaf_normalization_tag = "VAF" 9789 9790 # VCF infos tags 9791 vcf_infos_tags = { 9792 "VAF": "VAF Variant Frequency", 9793 } 9794 9795 # Prefix 9796 prefix = self.get_explode_infos_prefix() 9797 9798 # Variants table 9799 table_variants = self.get_table_variants() 9800 9801 # Header 9802 vcf_reader = self.get_header() 9803 9804 # Do not calculate if VAF already exists 9805 if "VAF" in vcf_reader.formats: 9806 log.debug("VAF already on genotypes") 9807 return 9808 9809 # Create variant id 9810 variant_id_column = self.get_variant_id_column() 9811 added_columns = [variant_id_column] 9812 9813 # variant_id, FORMAT and samples 9814 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9815 f""" "{sample}" """ for sample in self.get_header_sample_list() 9816 ) 9817 9818 # Create dataframe 9819 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9820 log.debug(f"query={query}") 9821 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9822 9823 vaf_normalization_set = [] 9824 9825 # for each sample vaf_normalization 9826 for sample in self.get_header_sample_list(): 9827 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9828 lambda row: vaf_normalization(row, sample=sample), axis=1 9829 ) 9830 vaf_normalization_set.append( 9831 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9832 ) 9833 9834 # Add VAF to FORMAT 9835 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9836 "FORMAT" 9837 ].apply(lambda x: str(x) + ":VAF") 9838 vaf_normalization_set.append( 9839 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9840 ) 9841 9842 # Add vaf_normalization to header 9843 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9844 id=vaf_normalization_tag, 9845 num="1", 9846 type="Float", 9847 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9848 type_code=self.code_type_map.get("Float"), 9849 ) 9850 9851 # Create fields to add in INFO 9852 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9853 9854 # Update 9855 sql_update = f""" 9856 UPDATE {table_variants} 9857 SET {sql_vaf_normalization_set} 9858 FROM dataframe_vaf_normalization 9859 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9860 9861 """ 9862 self.conn.execute(sql_update) 9863 9864 # Remove added columns 9865 for added_column in added_columns: 9866 self.drop_column(column=added_column) 9867 9868 # Delete dataframe 9869 del dataframe_vaf_normalization 9870 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Adds the INFO tags '<info>_stats_nb', '_list', '_min', '_max', '_mean', '_mediane'
        and '_stdev', computed across all samples via the `genotype_stats` helper.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when the VCF has genotypes: a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one description per generated statistic
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field (dataframe column holding the per-variant statistics dict)
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (tracked so it can be dropped afterwards)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                [f""" "{sample}" """ for sample in self.get_header_sample_list()]
            )

            # Create dataframe with genotype columns only
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: one statistics mapping per variant row
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of SQL CASE fragments, one per statistic
            sql_vaf_stats_fields = []

            # Process all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic into its own dataframe column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # A ';' separator is only needed from the second appended field on
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update: each CASE becomes an argument of the outer concat()
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update: append all statistics to INFO, joining on the variant id column
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory eagerly
            del dataframe_vaf_stats
            gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The
`info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median. Defaults to "VAF".
10010 def calculation_transcripts_annotation( 10011 self, info_json: str = None, info_format: str = None 10012 ) -> None: 10013 """ 10014 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 10015 field to it if transcripts are available. 10016 10017 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 10018 is a string parameter that represents the information field to be used in the transcripts JSON. 10019 It is used to specify the JSON format for the transcripts information. If no value is provided 10020 when calling the method, it defaults to " 10021 :type info_json: str 10022 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 10023 method is a string parameter that specifies the format of the information field to be used in 10024 the transcripts JSON. It is used to define the format of the information field 10025 :type info_format: str 10026 """ 10027 10028 # Create transcripts table 10029 transcripts_table = self.create_transcript_view() 10030 10031 # Add info field 10032 if transcripts_table: 10033 self.transcript_view_to_variants( 10034 transcripts_table=transcripts_table, 10035 transcripts_info_field_json=info_json, 10036 transcripts_info_field_format=info_format, 10037 ) 10038 else: 10039 log.info("No Transcripts to process. Check param.json file configuration")
The calculation_transcripts_annotation function creates a transcripts table and adds an info
field to it if transcripts are available.
Parameters
- info_json: The
`info_json` parameter in the `calculation_transcripts_annotation` method is a string parameter that names the information field to be used for the transcripts JSON. It is used to specify the JSON field for the transcripts information; if no value is provided when calling the method, it defaults to None.
- info_format: The `info_format` parameter in the `calculation_transcripts_annotation` method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field.
10041 def calculation_transcripts_prioritization(self) -> None: 10042 """ 10043 The function `calculation_transcripts_prioritization` creates a transcripts table and 10044 prioritizes transcripts based on certain criteria. 10045 """ 10046 10047 # Create transcripts table 10048 transcripts_table = self.create_transcript_view() 10049 10050 # Add info field 10051 if transcripts_table: 10052 self.transcripts_prioritization(transcripts_table=transcripts_table) 10053 else: 10054 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_prioritization creates a transcripts table and
prioritizes transcripts based on certain criteria.
10056 def calculation_transcripts_export(self) -> None: 10057 """ """ 10058 10059 # Create transcripts table 10060 transcripts_table = self.create_transcript_view() 10061 10062 # Add info field 10063 if transcripts_table: 10064 self.transcripts_export(transcripts_table=transcripts_table) 10065 else: 10066 log.info("No Transcripts to process. Check param.json file configuration")
10072 def transcripts_export( 10073 self, transcripts_table: str = None, param: dict = {} 10074 ) -> bool: 10075 """ """ 10076 10077 log.debug("Start transcripts export...") 10078 10079 # Param 10080 if not param: 10081 param = self.get_param() 10082 10083 # Param export 10084 param_transcript_export = param.get("transcripts", {}).get("export", {}) 10085 10086 # Output file 10087 transcripts_export_output = param_transcript_export.get("output", None) 10088 10089 if not param_transcript_export or not transcripts_export_output: 10090 log.warning(f"No transcriipts export parameters defined!") 10091 return False 10092 10093 # List of transcripts annotations 10094 query_describe = f""" 10095 SELECT column_name 10096 FROM ( 10097 DESCRIBE SELECT * FROM {transcripts_table} 10098 ) 10099 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10100 """ 10101 transcripts_annotations_list = list( 10102 self.get_query_to_df(query=query_describe)["column_name"] 10103 ) 10104 10105 # Create transcripts table for export 10106 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10107 random.choices(string.ascii_uppercase + string.digits, k=10) 10108 ) 10109 query_create_transcripts_table_export = f""" 10110 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10111 """ 10112 self.execute_query(query=query_create_transcripts_table_export) 10113 10114 # Output file format 10115 transcripts_export_output_format = get_file_format( 10116 filename=transcripts_export_output 10117 ) 10118 10119 # Format VCF - construct INFO 10120 if transcripts_export_output_format in ["vcf"]: 10121 10122 # Construct query update INFO and header 10123 query_update_info = [] 10124 for field in transcripts_annotations_list: 10125 10126 # If field not in header 10127 if field not in self.get_header_infos_list(): 10128 10129 # Add PZ Transcript in header 10130 
self.get_header().infos[field] = vcf.parser._Info( 10131 field, 10132 ".", 10133 "String", 10134 f"Annotation '{field}' from transcript view", 10135 "unknown", 10136 "unknown", 10137 0, 10138 ) 10139 10140 # Add field as INFO/tag 10141 query_update_info.append( 10142 f""" 10143 CASE 10144 WHEN "{field}" IS NOT NULL 10145 THEN concat('{field}=', "{field}", ';') 10146 ELSE '' 10147 END 10148 """ 10149 ) 10150 10151 # Query param 10152 query_update_info_value = ( 10153 f""" concat('', {", ".join(query_update_info)}) """ 10154 ) 10155 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """ 10156 10157 else: 10158 10159 # Query param 10160 query_update_info_value = f""" NULL """ 10161 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10162 10163 # Update query INFO column 10164 query_update = f""" 10165 UPDATE {transcripts_table_export} 10166 SET INFO = {query_update_info_value} 10167 10168 """ 10169 self.execute_query(query=query_update) 10170 10171 # Export 10172 self.export_output( 10173 output_file=transcripts_export_output, 10174 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10175 ) 10176 10177 # Drop transcripts export table 10178 query_drop_transcripts_table_export = f""" 10179 DROP TABLE {transcripts_table_export} 10180 """ 10181 self.execute_query(query=query_drop_transcripts_table_export)
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
        This parameter is used to identify the table where the transcripts data is stored for the
        prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
        that contains various configuration settings for the prioritization process of transcripts. It
        is used to customize the behavior of the prioritization algorithm and includes settings such as
        the prefix for prioritization fields, default profiles, and other
        :type param: dict
        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
        transcripts prioritization process is successfully completed, and `False` if there are any
        issues or if no profile is defined for transcripts prioritization.
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table: create the view when no table name is provided
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
            if transcripts_table is None:
                msg_err = "No Transcripts table availalble"
                log.error(msg_err)
                raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO if not exists (the update below appends to it)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: maps transcript-level field -> renamed variant-level field
        pz_param_pzfields = {}

        # PZ field carrying the selected transcript id
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields generated by the prioritization process
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: mandatory fields keep their prefixed name,
        # extra fields are renamed with the prefix and declared in the header
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add the renamed PZ field in header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # PZ fields param: restrict prioritization output to the mandatory fields
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization of the transcripts table itself
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query fragments
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        # One ';<renamed>=<value>' concat fragment per configured PZ field
        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by: configured transcript ordering, defaulting to Flag/Score descending
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "DESC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode into columns before ranking
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode: must exist in the header or in the transcripts table
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file (ranked list of preferred transcripts)
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced (preference order overrides PZ order)
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced (match transcript ids with their version suffix)
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            # (split_part strips the '.<version>' suffix when versions are not forced)
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update: rank transcripts per variant, preference-aware
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update: rank transcripts per variant by PZ order only
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table:
        # the top-ranked transcript (rn = 1) of each variant is appended to INFO
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True
The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.

Parameters

- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other options.

Returns

The function `transcripts_prioritization` returns a boolean value: `True` if the transcripts prioritization process is successfully completed, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
10494 def create_transcript_view_from_columns_map( 10495 self, 10496 transcripts_table: str = "transcripts", 10497 columns_maps: dict = {}, 10498 added_columns: list = [], 10499 temporary_tables: list = None, 10500 annotation_fields: list = None, 10501 column_rename: dict = {}, 10502 column_clean: bool = False, 10503 column_case: str = None, 10504 ) -> tuple[list, list, list]: 10505 """ 10506 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10507 specified columns mapping for transcripts data. 10508 10509 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10510 of the table where the transcripts data is stored or will be stored in the database. This table 10511 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10512 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10513 :type transcripts_table: str (optional) 10514 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10515 about how to map columns from a transcripts table to create a view. Each entry in the 10516 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10517 typically includes details such as the main transcript column and additional information columns 10518 :type columns_maps: dict 10519 :param added_columns: The `added_columns` parameter in the 10520 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10521 that will be added to the view being created based on the columns map provided. 
These columns 10522 are generated by exploding the transcript information columns along with the main transcript 10523 column 10524 :type added_columns: list 10525 :param temporary_tables: The `temporary_tables` parameter in the 10526 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10527 tables created during the process of creating a transcript view from a columns map. These 10528 temporary tables are used to store intermediate results or transformations before the final view 10529 is generated 10530 :type temporary_tables: list 10531 :param annotation_fields: The `annotation_fields` parameter in the 10532 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10533 used for annotation in the query view creation process. These fields are extracted from the 10534 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10535 :type annotation_fields: list 10536 :param column_rename: The `column_rename` parameter in the 10537 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10538 custom renaming for columns during the creation of the temporary table view. This parameter 10539 provides a mapping of original column names to the desired renamed column names. By using this 10540 parameter, 10541 :type column_rename: dict 10542 :param column_clean: The `column_clean` parameter in the 10543 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10544 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10545 removing any non-alphanumeric characters from them. 
This cleaning process ensures, defaults to 10546 False 10547 :type column_clean: bool (optional) 10548 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10549 function is used to specify the case transformation to be applied to the columns during the view 10550 creation process. It allows you to control whether the column values should be converted to 10551 lowercase, uppercase, or remain unchanged 10552 :type column_case: str 10553 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10554 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 10555 """ 10556 10557 log.debug("Start transcrpts view creation from columns map...") 10558 10559 # "from_columns_map": [ 10560 # { 10561 # "transcripts_column": "Ensembl_transcriptid", 10562 # "transcripts_infos_columns": [ 10563 # "genename", 10564 # "Ensembl_geneid", 10565 # "LIST_S2_score", 10566 # "LIST_S2_pred", 10567 # ], 10568 # }, 10569 # { 10570 # "transcripts_column": "Ensembl_transcriptid", 10571 # "transcripts_infos_columns": [ 10572 # "genename", 10573 # "VARITY_R_score", 10574 # "Aloft_pred", 10575 # ], 10576 # }, 10577 # ], 10578 10579 # Init 10580 if temporary_tables is None: 10581 temporary_tables = [] 10582 if annotation_fields is None: 10583 annotation_fields = [] 10584 10585 # Variants table 10586 table_variants = self.get_table_variants() 10587 10588 for columns_map in columns_maps: 10589 10590 # Transcript column 10591 transcripts_column = columns_map.get("transcripts_column", None) 10592 10593 # Transcripts infos columns 10594 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10595 10596 # Transcripts infos columns rename 10597 column_rename = columns_map.get("column_rename", column_rename) 10598 10599 # Transcripts infos columns clean 10600 column_clean = columns_map.get("column_clean", column_clean) 10601 10602 # Transcripts infos columns case 10603 column_case = 
columns_map.get("column_case", column_case) 10604 10605 if transcripts_column is not None: 10606 10607 # Explode 10608 added_columns += self.explode_infos( 10609 fields=[transcripts_column] + transcripts_infos_columns 10610 ) 10611 10612 # View clauses 10613 clause_select_variants = [] 10614 clause_select_tanscripts = [] 10615 for field in [transcripts_column] + transcripts_infos_columns: 10616 10617 # AS field 10618 as_field = field 10619 10620 # Rename 10621 if column_rename: 10622 as_field = column_rename.get(as_field, as_field) 10623 10624 # Clean 10625 if column_clean: 10626 as_field = clean_annotation_field(as_field) 10627 10628 # Case 10629 if column_case: 10630 if column_case.lower() in ["lower"]: 10631 as_field = as_field.lower() 10632 elif column_case.lower() in ["upper"]: 10633 as_field = as_field.upper() 10634 10635 # Clause select Variants 10636 clause_select_variants.append( 10637 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10638 ) 10639 10640 if field in [transcripts_column]: 10641 clause_select_tanscripts.append( 10642 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10643 ) 10644 else: 10645 clause_select_tanscripts.append( 10646 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10647 ) 10648 annotation_fields.append(as_field) 10649 10650 # Querey View 10651 query = f""" 10652 SELECT 10653 "#CHROM", POS, REF, ALT, INFO, 10654 "{transcripts_column}" AS 'transcript', 10655 {", ".join(clause_select_tanscripts)} 10656 FROM ( 10657 SELECT 10658 "#CHROM", POS, REF, ALT, INFO, 10659 {", ".join(clause_select_variants)} 10660 FROM {table_variants} 10661 ) 10662 WHERE "{transcripts_column}" IS NOT NULL 10663 """ 10664 10665 # Create temporary table 10666 temporary_table = transcripts_table + "".join( 10667 random.choices(string.ascii_uppercase + string.digits, k=10) 10668 ) 10669 10670 # Temporary_tables 10671 temporary_tables.append(temporary_table) 10672 query_view = f""" 10673 CREATE TEMPORARY TABLE 
{temporary_table} 10674 AS ({query}) 10675 """ 10676 self.execute_query(query=query_view) 10677 10678 return added_columns, temporary_tables, annotation_fields
The `create_transcript_view_from_columns_map` function generates a temporary table view based on
specified columns mapping for transcripts data.

Parameters

- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. Defaults to "transcripts".
- columns_maps: The `columns_maps` parameter contains information about how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns.
- added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column.
- temporary_tables: The `temporary_tables` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated.
- annotation_fields: The `annotation_fields` parameter in the `create_transcript_view_from_columns_map` function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the `transcripts_column` and `transcripts_infos_columns` specified in the `columns_maps`.
- column_rename: The `column_rename` parameter in the `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names.
- column_clean: The `column_clean` parameter in the `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the column values should be cleaned or not. If set to `True`, the column names will be cleaned by removing any non-alphanumeric characters from them. Defaults to False.
- column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` function is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged.

Returns

The `create_transcript_view_from_columns_map` function returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10680 def create_transcript_view_from_column_format( 10681 self, 10682 transcripts_table: str = "transcripts", 10683 column_formats: dict = {}, 10684 temporary_tables: list = None, 10685 annotation_fields: list = None, 10686 column_rename: dict = {}, 10687 column_clean: bool = False, 10688 column_case: str = None, 10689 ) -> tuple[list, list, list]: 10690 """ 10691 The `create_transcript_view_from_column_format` function generates a transcript view based on 10692 specified column formats, adds additional columns and annotation fields, and returns the list of 10693 temporary tables and annotation fields. 10694 10695 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10696 of the table containing the transcripts data. This table will be used as the base table for 10697 creating the transcript view. The default value for this parameter is "transcripts", but you can 10698 provide a different table name if needed, defaults to transcripts 10699 :type transcripts_table: str (optional) 10700 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10701 about the columns to be used for creating the transcript view. Each entry in the dictionary 10702 specifies the mapping between a transcripts column and a transcripts infos column. This 10703 parameter allows you to define how the columns from the transcripts table should be transformed 10704 or mapped 10705 :type column_formats: dict 10706 :param temporary_tables: The `temporary_tables` parameter in the 10707 `create_transcript_view_from_column_format` function is a list that stores the names of 10708 temporary views created during the process of creating a transcript view from a column format. 
10709 These temporary views are used to manipulate and extract data before generating the final 10710 transcript view 10711 :type temporary_tables: list 10712 :param annotation_fields: The `annotation_fields` parameter in the 10713 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10714 that are extracted from the temporary views created during the process. These annotation fields 10715 are obtained by querying the temporary views and extracting the column names excluding specific 10716 columns like `#CH 10717 :type annotation_fields: list 10718 :param column_rename: The `column_rename` parameter in the 10719 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10720 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10721 column names to new column names in this dictionary, you can rename specific columns during the 10722 process 10723 :type column_rename: dict 10724 :param column_clean: The `column_clean` parameter in the 10725 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10726 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10727 will be cleaned during the creation of the transcript view based on the specified column format, 10728 defaults to False 10729 :type column_clean: bool (optional) 10730 :param column_case: The `column_case` parameter in the 10731 `create_transcript_view_from_column_format` function is used to specify the case transformation 10732 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10733 to convert the column names to uppercase or lowercase, respectively 10734 :type column_case: str 10735 :return: The `create_transcript_view_from_column_format` function returns two lists: 10736 `temporary_tables` and `annotation_fields`. 
10737 """ 10738 10739 log.debug("Start transcrpts view creation from column format...") 10740 10741 # "from_column_format": [ 10742 # { 10743 # "transcripts_column": "ANN", 10744 # "transcripts_infos_column": "Feature_ID", 10745 # } 10746 # ], 10747 10748 # Init 10749 if temporary_tables is None: 10750 temporary_tables = [] 10751 if annotation_fields is None: 10752 annotation_fields = [] 10753 10754 for column_format in column_formats: 10755 10756 # annotation field and transcript annotation field 10757 annotation_field = column_format.get("transcripts_column", "ANN") 10758 transcript_annotation = column_format.get( 10759 "transcripts_infos_column", "Feature_ID" 10760 ) 10761 10762 # Transcripts infos columns rename 10763 column_rename = column_format.get("column_rename", column_rename) 10764 10765 # Transcripts infos columns clean 10766 column_clean = column_format.get("column_clean", column_clean) 10767 10768 # Transcripts infos columns case 10769 column_case = column_format.get("column_case", column_case) 10770 10771 # Temporary View name 10772 temporary_view_name = transcripts_table + "".join( 10773 random.choices(string.ascii_uppercase + string.digits, k=10) 10774 ) 10775 10776 # Create temporary view name 10777 temporary_view_name = self.annotation_format_to_table( 10778 uniquify=True, 10779 annotation_field=annotation_field, 10780 view_name=temporary_view_name, 10781 annotation_id=transcript_annotation, 10782 column_rename=column_rename, 10783 column_clean=column_clean, 10784 column_case=column_case, 10785 ) 10786 10787 # Annotation fields 10788 if temporary_view_name: 10789 query_annotation_fields = f""" 10790 SELECT * 10791 FROM ( 10792 DESCRIBE SELECT * 10793 FROM {temporary_view_name} 10794 ) 10795 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10796 """ 10797 df_annotation_fields = self.get_query_to_df( 10798 query=query_annotation_fields 10799 ) 10800 10801 # Add temporary view and annotation fields 10802 
temporary_tables.append(temporary_view_name) 10803 annotation_fields += list(set(df_annotation_fields["column_name"])) 10804 10805 return temporary_tables, annotation_fields
The `create_transcript_view_from_column_format` function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.

Parameters

- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed. Defaults to "transcripts".
- column_formats: The `column_formats` parameter contains information about the columns to be used for creating the transcript view. Each entry specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped.
- temporary_tables: The `temporary_tables` parameter in the `create_transcript_view_from_column_format` function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view.
- annotation_fields: The `annotation_fields` parameter in the `create_transcript_view_from_column_format` function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding specific columns like `#CHROM`.
- column_rename: The `column_rename` parameter in the `create_transcript_view_from_column_format` function is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process.
- column_clean: The `column_clean` parameter in the `create_transcript_view_from_column_format` function is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns will be cleaned during the creation of the transcript view based on the specified column format. Defaults to False.
- column_case: The `column_case` parameter in the `create_transcript_view_from_column_format` function is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively.

Returns

The `create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = False,
        param: dict = {},
    ) -> str:
        """
        Build (or rebuild) the transcripts table from the configured sources.

        Reads param["transcripts"]["struct"], delegates 'from_columns_map' and
        'from_column_format' entries to the corresponding
        `create_transcript_view_from_*` helpers, merges the resulting temporary
        tables with UNION BY NAME, aggregates annotations per
        variant/transcript, optionally applies transcript ID mapping and
        version stripping, and stores the result in `transcripts_table`.

        :param transcripts_table: Name of the table to create; when None, it is
            taken from param["transcripts"]["table"], defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: When True, drop an existing table of the
            same name before creating it, defaults to False
        :type transcripts_table_drop: bool (optional)
        :param param: Parameters dict; when empty, self.get_param() is used
        :type param: dict
        :return: The name of the created transcripts table, or None when no
            'struct' section is defined in the parameters
        """

        log.debug("Start transcripts view creation...")

        # Default table name when none is provided or configured
        transcripts_table_default = "transcripts"

        # Param
        if not param:
            param = self.get_param()

        # Struct: describes how transcripts are encoded in the annotations
        struct = param.get("transcripts", {}).get("struct", None)

        # Whether to strip the version suffix from transcript IDs
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Optional transcript ID mapping file (transcript/alias pairs)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Whether to keep only transcripts present in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table (to be dropped afterwards)
            added_columns = []

            # Temporary tables created by the helpers
            temporary_tables = []

            # Annotation fields exposed by the temporary tables
            annotation_fields = []

            # From columns map
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # From column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Deduplicate and remove variant-key/reserved columns from the
            # annotation field list
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query (UNION BY NAME aligns columns by
            # name across tables with different column sets)
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # Other temporary table (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the nested subqueries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # Aggregation clauses: one per annotation field, collapsing
            # distinct values into a comma-separated string
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # Mapping file loaded as a DataFrame; referenced BY NAME in the
                # SQL below — presumably resolved via DuckDB's replacement scan
                # of local Python variables, so do not remove this "unused"
                # variable (TODO confirm)
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version remove
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group-by key for the final merge: mapped ID when available,
                # original ID otherwise (both without version suffix)
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # First-level merge: join mapping and aggregate annotations
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        if field in ["transcript_original"]:
                            # NOTE(review): output alias is the plural
                            # "transcripts_mapped" — confirm this is intended
                            # (vs "transcript_mapped")
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Second-level merge: collapse rows onto the mapped transcript
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Keep only transcripts present in the mapping file, if forced
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                # NOTE(review): query_transcript_column_select is assigned but
                # the query below uses query_transcript_column directly
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcript view if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcript view (no-op if the table exists and
            # was not dropped above)
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove columns added by explode_infos during view creation
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # No struct defined: nothing to build
            transcripts_table = None

        return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- transcripts_table: The
`transcripts_table` parameter in the `create_transcript_view` function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data; defaults to `transcripts`. - transcripts_table_drop: The
`transcripts_table_drop` parameter in the `create_transcript_view` function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`, the function will drop the existing transcripts table if it exists; defaults to `False`. - param: The
`param` parameter in the `create_transcript_view` function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization.
Returns
The
`create_transcript_view` function returns the name of the transcripts table that was created or modified during the execution of the function.
11091 def annotation_format_to_table( 11092 self, 11093 uniquify: bool = True, 11094 annotation_field: str = "ANN", 11095 annotation_id: str = "Feature_ID", 11096 view_name: str = "transcripts", 11097 column_rename: dict = {}, 11098 column_clean: bool = False, 11099 column_case: str = None, 11100 ) -> str: 11101 """ 11102 The `annotation_format_to_table` function converts annotation data from a VCF file into a 11103 structured table format, ensuring unique values and creating a temporary table for further 11104 processing or analysis. 11105 11106 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure 11107 unique values in the output or not. If set to `True`, the function will make sure that the 11108 output values are unique, defaults to True 11109 :type uniquify: bool (optional) 11110 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file 11111 that contains the annotation information for each variant. This field is used to extract the 11112 annotation details for further processing in the function. By default, it is set to "ANN", 11113 defaults to ANN 11114 :type annotation_field: str (optional) 11115 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method 11116 is used to specify the identifier for the annotation feature. This identifier will be used as a 11117 column name in the resulting table or view that is created based on the annotation data. It 11118 helps in uniquely identifying each annotation entry in the, defaults to Feature_ID 11119 :type annotation_id: str (optional) 11120 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used 11121 to specify the name of the temporary table that will be created to store the transformed 11122 annotation data. This table will hold the extracted information from the annotation field in a 11123 structured format for further processing or analysis. 
By default,, defaults to transcripts 11124 :type view_name: str (optional) 11125 :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method 11126 is a dictionary that allows you to specify custom renaming for columns. By providing key-value 11127 pairs in this dictionary, you can rename specific columns in the resulting table or view that is 11128 created based on the annotation data. This feature enables 11129 :type column_rename: dict 11130 :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is 11131 a boolean flag that determines whether the annotation field should undergo a cleaning process. 11132 If set to `True`, the function will clean the annotation field before further processing. This 11133 cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults 11134 to False 11135 :type column_clean: bool (optional) 11136 :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is 11137 used to specify the case transformation to be applied to the column names extracted from the 11138 annotation data. It allows you to set the case of the column names to either lowercase or 11139 uppercase for consistency or other specific requirements during the conversion 11140 :type column_case: str 11141 :return: The function `annotation_format_to_table` is returning the name of the view created, 11142 which is stored in the variable `view_name`. 
11143 """ 11144 11145 # Annotation field 11146 annotation_format = "annotation_explode" 11147 11148 # Transcript annotation 11149 if column_rename: 11150 annotation_id = column_rename.get(annotation_id, annotation_id) 11151 11152 if column_clean: 11153 annotation_id = clean_annotation_field(annotation_id) 11154 11155 # Prefix 11156 prefix = self.get_explode_infos_prefix() 11157 if prefix: 11158 prefix = "INFO/" 11159 11160 # Annotation fields 11161 annotation_infos = prefix + annotation_field 11162 annotation_format_infos = prefix + annotation_format 11163 11164 # Variants table 11165 table_variants = self.get_table_variants() 11166 11167 # Header 11168 vcf_reader = self.get_header() 11169 11170 # Add columns 11171 added_columns = [] 11172 11173 # Explode HGVS field in column 11174 added_columns += self.explode_infos(fields=[annotation_field]) 11175 11176 if annotation_field in vcf_reader.infos: 11177 11178 # Extract ANN header 11179 ann_description = vcf_reader.infos[annotation_field].desc 11180 pattern = r"'(.+?)'" 11181 match = re.search(pattern, ann_description) 11182 if match: 11183 ann_header_match = match.group(1).split(" | ") 11184 ann_header = [] 11185 ann_header_desc = {} 11186 for i in range(len(ann_header_match)): 11187 ann_header_info = "".join( 11188 char for char in ann_header_match[i] if char.isalnum() 11189 ) 11190 ann_header.append(ann_header_info) 11191 ann_header_desc[ann_header_info] = ann_header_match[i] 11192 if not ann_header_desc: 11193 raise ValueError("Invalid header description format") 11194 else: 11195 raise ValueError("Invalid header description format") 11196 11197 # Create variant id 11198 variant_id_column = self.get_variant_id_column() 11199 added_columns += [variant_id_column] 11200 11201 # Create dataframe 11202 dataframe_annotation_format = self.get_query_to_df( 11203 f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 11204 ) 11205 11206 # Create annotation columns 
11207 dataframe_annotation_format[ 11208 annotation_format_infos 11209 ] = dataframe_annotation_format[annotation_infos].apply( 11210 lambda x: explode_annotation_format( 11211 annotation=str(x), 11212 uniquify=uniquify, 11213 output_format="JSON", 11214 prefix="", 11215 header=list(ann_header_desc.values()), 11216 ) 11217 ) 11218 11219 # Find keys 11220 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 11221 df_keys = self.get_query_to_df(query=query_json) 11222 11223 # Check keys 11224 query_json_key = [] 11225 for _, row in df_keys.iterrows(): 11226 11227 # Key 11228 key = row.iloc[0] 11229 key_clean = key 11230 11231 # key rename 11232 if column_rename: 11233 key_clean = column_rename.get(key_clean, key_clean) 11234 11235 # key clean 11236 if column_clean: 11237 key_clean = clean_annotation_field(key_clean) 11238 11239 # Key case 11240 if column_case: 11241 if column_case.lower() in ["lower"]: 11242 key_clean = key_clean.lower() 11243 elif column_case.lower() in ["upper"]: 11244 key_clean = key_clean.upper() 11245 11246 # Type 11247 query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');""" 11248 11249 # Get DataFrame from query 11250 df_json_type = self.get_query_to_df(query=query_json_type) 11251 11252 # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN 11253 with pd.option_context("future.no_silent_downcasting", True): 11254 df_json_type.fillna(value="", inplace=True) 11255 replace_dict = {None: np.nan, "": np.nan} 11256 df_json_type.replace(replace_dict, inplace=True) 11257 df_json_type.dropna(inplace=True) 11258 11259 # Detect column type 11260 column_type = detect_column_type(df_json_type[key_clean]) 11261 11262 # Append 11263 query_json_key.append( 11264 
f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """ 11265 ) 11266 11267 # Create view 11268 query_view = f""" 11269 CREATE TEMPORARY TABLE {view_name} 11270 AS ( 11271 SELECT *, {annotation_id} AS 'transcript' 11272 FROM ( 11273 SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)} 11274 FROM dataframe_annotation_format 11275 ) 11276 ); 11277 """ 11278 self.execute_query(query=query_view) 11279 11280 else: 11281 11282 # Return None 11283 view_name = None 11284 11285 # Remove added columns 11286 for added_column in added_columns: 11287 self.drop_column(column=added_column) 11288 11289 return view_name
The annotation_format_to_table function converts annotation data from a VCF file into a
structured table format, ensuring unique values and creating a temporary table for further
processing or analysis.
Parameters
- uniquify: The
`uniquify` parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to `True`, the function will make sure that the output values are unique; defaults to `True`. - annotation_field: The
`annotation_field` parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function; defaults to `ANN`. - annotation_id: The
`annotation_id` parameter in the `annotation_format_to_table` method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry; defaults to `Feature_ID`. - view_name: The
`view_name` parameter in the `annotation_format_to_table` method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis; defaults to `transcripts`. - column_rename: The
`column_rename` parameter in the `annotation_format_to_table` method is a dictionary that allows you to specify custom renaming for columns. By providing key-value pairs in this dictionary, you can rename specific columns in the resulting table or view that is created based on the annotation data. - column_clean: The
`column_clean` parameter in the `annotation_format_to_table` method is a boolean flag that determines whether the annotation field should undergo a cleaning process. If set to `True`, the function will clean the annotation field before further processing. This cleaning step may involve removing unwanted characters or formatting inconsistencies; defaults to `False`. - column_case: The
`column_case` parameter in the `annotation_format_to_table` method is used to specify the case transformation to be applied to the column names extracted from the annotation data. It allows you to set the case of the column names to either lowercase or uppercase for consistency or other specific requirements during the conversion.
Returns
The function
`annotation_format_to_table` is returning the name of the view created, which is stored in the variable `view_name`.
11291 def transcript_view_to_variants( 11292 self, 11293 transcripts_table: str = None, 11294 transcripts_column_id: str = None, 11295 transcripts_info_json: str = None, 11296 transcripts_info_field_json: str = None, 11297 transcripts_info_format: str = None, 11298 transcripts_info_field_format: str = None, 11299 param: dict = {}, 11300 ) -> bool: 11301 """ 11302 The `transcript_view_to_variants` function updates a variants table with information from 11303 transcripts in JSON format. 11304 11305 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11306 table containing the transcripts data. If this parameter is not provided, the function will 11307 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11308 :type transcripts_table: str 11309 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11310 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11311 identifier is used to match transcripts with variants in the database 11312 :type transcripts_column_id: str 11313 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11314 of the column in the variants table where the transcripts information will be stored in JSON 11315 format. This parameter allows you to define the column in the variants table that will hold the 11316 JSON-formatted information about transcripts 11317 :type transcripts_info_json: str 11318 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11319 specify the field in the VCF header that will contain information about transcripts in JSON 11320 format. 
This field will be added to the VCF header as an INFO field with the specified name 11321 :type transcripts_info_field_json: str 11322 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11323 format of the information about transcripts that will be stored in the variants table. This 11324 format can be used to define how the transcript information will be structured or displayed 11325 within the variants table 11326 :type transcripts_info_format: str 11327 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11328 specify the field in the VCF header that will contain information about transcripts in a 11329 specific format. This field will be added to the VCF header as an INFO field with the specified 11330 name 11331 :type transcripts_info_field_format: str 11332 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11333 that contains various configuration settings related to transcripts. It is used to provide 11334 default values for certain parameters if they are not explicitly provided when calling the 11335 method. The `param` dictionary can be passed as an argument 11336 :type param: dict 11337 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11338 if the operation is successful and `False` if certain conditions are not met. 
11339 """ 11340 11341 msg_info_prefix = "Start transcripts view to variants annotations" 11342 11343 log.debug(f"{msg_info_prefix}...") 11344 11345 # Default 11346 transcripts_table_default = "transcripts" 11347 transcripts_column_id_default = "transcript" 11348 transcripts_info_json_default = None 11349 transcripts_info_format_default = None 11350 transcripts_info_field_json_default = None 11351 transcripts_info_field_format_default = None 11352 11353 # Param 11354 if not param: 11355 param = self.get_param() 11356 11357 # Transcripts table 11358 if transcripts_table is None: 11359 transcripts_table = param.get("transcripts", {}).get( 11360 "table", transcripts_table_default 11361 ) 11362 11363 # Transcripts column ID 11364 if transcripts_column_id is None: 11365 transcripts_column_id = param.get("transcripts", {}).get( 11366 "column_id", transcripts_column_id_default 11367 ) 11368 11369 # Transcripts info json 11370 if transcripts_info_json is None: 11371 transcripts_info_json = param.get("transcripts", {}).get( 11372 "transcripts_info_json", transcripts_info_json_default 11373 ) 11374 11375 # Transcripts info field JSON 11376 if transcripts_info_field_json is None: 11377 transcripts_info_field_json = param.get("transcripts", {}).get( 11378 "transcripts_info_field_json", transcripts_info_field_json_default 11379 ) 11380 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11381 # transcripts_info_json = transcripts_info_field_json 11382 11383 # Transcripts info format 11384 if transcripts_info_format is None: 11385 transcripts_info_format = param.get("transcripts", {}).get( 11386 "transcripts_info_format", transcripts_info_format_default 11387 ) 11388 11389 # Transcripts info field FORMAT 11390 if transcripts_info_field_format is None: 11391 transcripts_info_field_format = param.get("transcripts", {}).get( 11392 "transcripts_info_field_format", transcripts_info_field_format_default 11393 ) 11394 # if ( 11395 # 
transcripts_info_field_format is not None 11396 # and transcripts_info_format is None 11397 # ): 11398 # transcripts_info_format = transcripts_info_field_format 11399 11400 # Variants table 11401 table_variants = self.get_table_variants() 11402 11403 # Check info columns param 11404 if ( 11405 transcripts_info_json is None 11406 and transcripts_info_field_json is None 11407 and transcripts_info_format is None 11408 and transcripts_info_field_format is None 11409 ): 11410 return False 11411 11412 # Transcripts infos columns 11413 query_transcripts_infos_columns = f""" 11414 SELECT * 11415 FROM ( 11416 DESCRIBE SELECT * FROM {transcripts_table} 11417 ) 11418 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11419 """ 11420 transcripts_infos_columns = list( 11421 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11422 ) 11423 11424 # View results 11425 clause_select = [] 11426 clause_to_json = [] 11427 clause_to_format = [] 11428 for field in transcripts_infos_columns: 11429 # Do not consider INFO field for export into fields 11430 if field not in ["INFO"]: 11431 clause_select.append( 11432 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11433 ) 11434 clause_to_json.append(f""" '{field}': "{field}" """) 11435 clause_to_format.append(f""" "{field}" """) 11436 11437 # Update 11438 update_set_json = [] 11439 update_set_format = [] 11440 11441 # VCF header 11442 vcf_reader = self.get_header() 11443 11444 # Transcripts to info column in JSON 11445 if transcripts_info_json: 11446 11447 # Create column on variants table 11448 self.add_column( 11449 table_name=table_variants, 11450 column_name=transcripts_info_json, 11451 column_type="JSON", 11452 default_value=None, 11453 drop=False, 11454 ) 11455 11456 # Add header 11457 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11458 transcripts_info_json, 11459 ".", 11460 "String", 11461 "Transcripts in JSON format", 11462 "unknwon", 
11463 "unknwon", 11464 self.code_type_map["String"], 11465 ) 11466 11467 # Add to update 11468 update_set_json.append( 11469 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11470 ) 11471 11472 # Transcripts to info field in JSON 11473 if transcripts_info_field_json: 11474 11475 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11476 11477 # Add to update 11478 update_set_json.append( 11479 f""" 11480 INFO = concat( 11481 CASE 11482 WHEN INFO NOT IN ('', '.') 11483 THEN INFO 11484 ELSE '' 11485 END, 11486 CASE 11487 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11488 THEN concat( 11489 ';{transcripts_info_field_json}=', 11490 t.{transcripts_info_json} 11491 ) 11492 ELSE '' 11493 END 11494 ) 11495 """ 11496 ) 11497 11498 # Add header 11499 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11500 transcripts_info_field_json, 11501 ".", 11502 "String", 11503 "Transcripts in JSON format", 11504 "unknwon", 11505 "unknwon", 11506 self.code_type_map["String"], 11507 ) 11508 11509 if update_set_json: 11510 11511 # Update query 11512 query_update = f""" 11513 UPDATE {table_variants} 11514 SET {", ".join(update_set_json)} 11515 FROM 11516 ( 11517 SELECT 11518 "#CHROM", POS, REF, ALT, 11519 concat( 11520 '{{', 11521 string_agg( 11522 '"' || "{transcripts_column_id}" || '":' || 11523 to_json(json_output) 11524 ), 11525 '}}' 11526 )::JSON AS {transcripts_info_json} 11527 FROM 11528 ( 11529 SELECT 11530 "#CHROM", POS, REF, ALT, 11531 "{transcripts_column_id}", 11532 to_json( 11533 {{{",".join(clause_to_json)}}} 11534 )::JSON AS json_output 11535 FROM 11536 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11537 WHERE "{transcripts_column_id}" IS NOT NULL 11538 ) 11539 GROUP BY "#CHROM", POS, REF, ALT 11540 ) AS t 11541 WHERE {table_variants}."#CHROM" = t."#CHROM" 11542 AND {table_variants}."POS" = t."POS" 11543 AND {table_variants}."REF" = t."REF" 11544 AND 
{table_variants}."ALT" = t."ALT" 11545 """ 11546 11547 self.execute_query(query=query_update) 11548 11549 # Transcripts to info column in FORMAT 11550 if transcripts_info_format: 11551 11552 # Create column on variants table 11553 self.add_column( 11554 table_name=table_variants, 11555 column_name=transcripts_info_format, 11556 column_type="VARCHAR", 11557 default_value=None, 11558 drop=False, 11559 ) 11560 11561 # Add header 11562 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11563 transcripts_info_format, 11564 ".", 11565 "String", 11566 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11567 "unknwon", 11568 "unknwon", 11569 self.code_type_map["String"], 11570 ) 11571 11572 # Add to update 11573 update_set_format.append( 11574 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11575 ) 11576 11577 else: 11578 11579 # Set variable for internal queries 11580 transcripts_info_format = "transcripts_info_format" 11581 11582 # Transcripts to info field in JSON 11583 if transcripts_info_field_format: 11584 11585 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11586 11587 # Add to update 11588 update_set_format.append( 11589 f""" 11590 INFO = concat( 11591 CASE 11592 WHEN INFO NOT IN ('', '.') 11593 THEN INFO 11594 ELSE '' 11595 END, 11596 CASE 11597 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11598 THEN concat( 11599 ';{transcripts_info_field_format}=', 11600 t.{transcripts_info_format} 11601 ) 11602 ELSE '' 11603 END 11604 ) 11605 """ 11606 ) 11607 11608 # Add header 11609 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11610 transcripts_info_field_format, 11611 ".", 11612 "String", 11613 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11614 "unknwon", 11615 "unknwon", 11616 self.code_type_map["String"], 11617 ) 11618 11619 if update_set_format: 11620 11621 # Update query 11622 query_update = f""" 11623 UPDATE 
{table_variants} 11624 SET {", ".join(update_set_format)} 11625 FROM 11626 ( 11627 SELECT 11628 "#CHROM", POS, REF, ALT, 11629 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11630 FROM 11631 ( 11632 SELECT 11633 "#CHROM", POS, REF, ALT, 11634 "{transcripts_column_id}", 11635 concat( 11636 "{transcripts_column_id}", 11637 '|', 11638 {", '|', ".join(clause_to_format)} 11639 ) AS {transcripts_info_format} 11640 FROM 11641 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11642 ) 11643 GROUP BY "#CHROM", POS, REF, ALT 11644 ) AS t 11645 WHERE {table_variants}."#CHROM" = t."#CHROM" 11646 AND {table_variants}."POS" = t."POS" 11647 AND {table_variants}."REF" = t."REF" 11648 AND {table_variants}."ALT" = t."ALT" 11649 """ 11650 11651 self.execute_query(query=query_update) 11652 11653 return True
The transcript_view_to_variants function updates a variants table with information from
transcripts in JSON format.
Parameters
- transcripts_table: The
`transcripts_table` parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the `param` dictionary or use a default value of "transcripts". - transcripts_column_id: The
`transcripts_column_id` parameter is used to specify the column in the `transcripts_table` that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database. - transcripts_info_json: The
`transcripts_info_json` parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts. - transcripts_info_field_json: The
`transcripts_info_field_json` parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name. - transcripts_info_format: The
`transcripts_info_format` parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table. - transcripts_info_field_format: The
`transcripts_info_field_format` parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name. - param: The
`param` parameter in the `transcript_view_to_variants` method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The `param` dictionary can be passed as an argument.
Returns
The function
`transcript_view_to_variants` returns a boolean value. It returns `True` if the operation is successful and `False` if certain conditions are not met.
    def rename_info_fields(
        self, fields_to_rename: dict = None, table: str = None
    ) -> dict:
        """
        The `rename_info_fields` function renames specified fields in a VCF file header and updates
        corresponding INFO fields in the variants table.

        :param fields_to_rename: The `fields_to_rename` parameter is a dictionary that contains the
        mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the
        dictionary represent the original field names, and the corresponding values represent the
        new names. A value of `None` removes the field from INFO instead of renaming it
        :type fields_to_rename: dict
        :param table: The `table` parameter represents the name of the table in which the variants
        data is stored; defaults to the variants table of this object
        :type table: str
        :return: The `rename_info_fields` function returns a dictionary `fields_renamed` that
        contains the original field names as keys and their corresponding new names (or None if
        the field was removed) as values. Fields absent from the header are skipped (with a
        warning) and do not appear in the result
        """

        # Init
        fields_renamed = {}
        config = self.get_config()
        access = config.get("access")

        # Default to the variants table
        if table is None:
            table = self.get_table_variants()

        # regexp replace function: replacements are chained into nested
        # regexp_replace() expressions, batched by `regex_replace_partition`
        # fields per UPDATE statement to keep each SQL expression bounded
        regex_replace_dict = {}
        regex_replace_nb = 0
        regex_replace_partition = 125
        regex_replace = "INFO"

        # Skip entirely in read-only mode or without a mapping
        if fields_to_rename is not None and access not in ["RO"]:

            log.info("Rename or remove fields...")

            # Header
            header = self.get_header()

            for field_to_rename, field_renamed in fields_to_rename.items():

                if field_to_rename in header.infos:

                    # Rename header: copy the INFO declaration under the new
                    # name (skipped for removal), then drop the old entry
                    if field_renamed is not None:
                        header.infos[field_renamed] = vcf.parser._Info(
                            field_renamed,
                            header.infos[field_to_rename].num,
                            header.infos[field_to_rename].type,
                            header.infos[field_to_rename].desc,
                            header.infos[field_to_rename].source,
                            header.infos[field_to_rename].version,
                            header.infos[field_to_rename].type_code,
                        )
                    del header.infos[field_to_rename]

                    # Pattern matching the field at start of INFO or after ';',
                    # with its optional '=value' part
                    # NOTE(review): the field name is interpolated unescaped —
                    # assumes plain alphanumeric/underscore INFO keys (regex
                    # metacharacters in a key would corrupt the pattern)
                    field_pattern = rf'(^|;)({field_to_rename})($|;|=[^;]*)'
                    if field_renamed is not None:
                        field_renamed_pattern = rf'\1{field_renamed}\3'
                    else:
                        # NOTE(review): removal drops both captured separators —
                        # removing the first INFO field leaves a leading ';' and
                        # removing a flag field between two others merges its
                        # neighbours without a separator; confirm INFO strings
                        # are normalized downstream
                        field_renamed_pattern = ''

                    # regexp replace: chain this replacement into the current
                    # batch; start a fresh chain every
                    # `regex_replace_partition` fields
                    regex_replace_nb += 1
                    regex_replace_key = math.floor(regex_replace_nb / regex_replace_partition)
                    if (regex_replace_nb % regex_replace_partition) == 0:
                        regex_replace = "INFO"
                    regex_replace = f"regexp_replace({regex_replace}, '{field_pattern}', '{field_renamed_pattern}')"
                    regex_replace_dict[regex_replace_key] = regex_replace

                    # Return
                    fields_renamed[field_to_rename] = field_renamed

                    # Log
                    if field_renamed is not None:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' renamed to '{field_renamed}'")
                    else:
                        log.info(f"Rename or remove fields - field '{field_to_rename}' removed")

                else:

                    log.warning(f"Rename or remove fields - field '{field_to_rename}' not in header")


        # Rename INFO: apply each batch of chained replacements with one UPDATE
        for regex_replace_key, regex_replace in regex_replace_dict.items():
            log.info(f"Rename or remove fields - Process [{regex_replace_key+1}/{len(regex_replace_dict)}]...")
            query = f"""
            UPDATE {table}
            SET
                INFO = {regex_replace}
            """
            log.debug(f"query={query}")
            self.execute_query(query=query)

        return fields_renamed
The rename_info_fields function renames specified fields in a VCF file header and updates
corresponding INFO fields in the variants table.
Parameters
- fields_to_rename: The
`fields_to_rename` parameter is a dictionary that contains the mapping of fields to be renamed in a VCF (Variant Call Format) file. The keys in the dictionary represent the original field names that need to be renamed, and the corresponding values represent the new names to which the fields should be renamed. - table: The
`table` parameter in the `rename_info_fields` function represents the name of the table in which the variants data is stored. This table contains information about genetic variants, and the function updates the corresponding INFO fields in this table when renaming specified fields in the VCF file header.
Returns
The
rename_info_fieldsfunction returns a dictionaryfields_renamedthat contains the original field names as keys and their corresponding new names (or None if the field was removed) as values after renaming or removing specified fields in a VCF file header and updating corresponding INFO fields in the variants table.
def calculation_rename_info_fields(
    self,
    fields_to_rename: dict = None,
    table: str = None,
    operation_name: str = "RENAME_INFO_FIELDS",
) -> None:
    """
    Resolve rename settings from the parameters and delegate to
    `rename_info_fields`.

    Looks up ``calculation.calculations.<operation_name>`` in the parameter
    dictionary for default values of ``fields_to_rename`` and ``table``,
    using each configured value only when the corresponding argument was
    not supplied by the caller.

    :param fields_to_rename: dictionary mapping original INFO field names
        (keys) to their new names (values); when omitted, taken from the
        parameters
    :type fields_to_rename: dict
    :param table: name of the table whose fields are to be renamed; when
        omitted, taken from the parameters
    :type table: str
    :param operation_name: key under ``calculation.calculations`` holding
        this operation's settings, defaults to RENAME_INFO_FIELDS
    :type operation_name: str (optional)
    """

    # Settings for this specific operation within the parameters dictionary
    operation_settings = (
        self.get_param()
        .get("calculation", {})
        .get("calculations", {})
        .get(operation_name, {})
    )

    # Fall back to configured values when arguments were not provided
    if fields_to_rename is None:
        fields_to_rename = operation_settings.get("fields_to_rename", None)
    if table is None:
        table = operation_settings.get("table", None)

    # Delegate the actual header rename and INFO column update
    renamed_fields = self.rename_info_fields(
        fields_to_rename=fields_to_rename, table=table
    )

    log.debug(f"renamed_fields:{renamed_fields}")
The `calculation_rename_info_fields` function retrieves parameters from a dictionary, updates
fields to rename and table if provided, and then calls another function to rename the fields.

Parameters
- fields_to_rename: `fields_to_rename` is a dictionary that contains the fields to be renamed
  in a table. Each key-value pair in the dictionary represents the original field name as the
  key and the new field name as the value.
- table: The `table` parameter in the `calculation_rename_info_fields` method is used to
  specify the name of the table for which the fields are to be renamed. It is a string type
  parameter.
- operation_name: The `operation_name` parameter in the `calculation_rename_info_fields`
  method is a string that specifies the name of the operation being performed. In this
  context, it is used as a default value for the operation name if not explicitly provided
  when calling the function, defaults to RENAME_INFO_FIELDS.